def scraperwiki():
    #zipcodes = get_zips(urlopen(URLS['zips']))
    attach('us_zip_codes')
    zipcodes = [str(row['zip']) for row in select('zip from zipcodes')]

    # Skip zipcodes that are already finished.
    try:
        finished_zipcodes = [row['zipcode'] for row in select('zipcode from finished_zipcodes')]
    except:
        pass
    else:
        #print 'Already scraped these zipcodes:'
        for zipcode in finished_zipcodes:
            try:
                zipcodes.remove(zipcode)
            except ValueError:
                # The zipcodes database isn't complete.
                pass

    for zipcode in zipcodes:
        print 'Scraping ' + zipcode
        lastpage = int(get_lastpage(search(zipcode, '1', save=False)))
        for page in [str(p) for p in range(1, lastpage + 1)]:
            theaters = get_theaters(zipcode, page, save=False)
            for theater in theaters:
                info = theater_info(theater)
                info = clean_info(info)
                save(['url'], info2dictRow(info, zipcode), 'locations')
            sleep(INTERVAL)
        save(['zipcode'], {'zipcode': zipcode}, 'finished_zipcodes')
def main(): """Check what has been scraped so far, then resume. It might be good to check for gaps in the scraping. Or maybe a recursive approach isn't the best for search pages like this.""" #What's already been scraped recently? if not 'directory' in show_tables(): last_searched=0 else: #Only skip things from the current scraper completion attempt. if 'scrape_completions' in show_tables(): raw_ids=select('scrape_ids from scrape_completions order by completion_id desc limit 1')[0]['scrape_ids'] max_to_ignore=max(map(int,raw_ids.split(','))) min_to_scrape=max_to_ignore+1 else: min_to_scrape=1 incomplete_scrape=select('max("search_id") as m from directory where scrape_id>='+str(min_to_scrape))[0]['m'] if incomplete_scrape!=None: last_searched=incomplete_scrape else: last_searched=0 if 'scrape_times' in show_tables(): last_id=select('max("scrape_id") as m from scrape_times')[0]['m'] else: last_id=0 #Time of scrape start scrape_id=last_id+1 save(['scrape_id'],{"scrape_id":scrape_id,"scrape_time":time()},'scrape_times') grab(last_searched+1,{"scrape_id":scrape_id},oncompletion=oncompletion)
def analyze():
    d = select("""
        `link-href`, GROUP_CONCAT(`author`) AS `authors`, count(*) AS "count"
        FROM `links` JOIN `topics` ON `links`.`topic-href` = `topics`.`topic-href`
        GROUP BY `link-href`
    """)
    execute('DROP TABLE IF EXISTS `wrote-about-same-things`')
    save([], d, 'wrote-about-same-things')
    print '''
These look most exciting because three different people wrote about each.

3  Kiana Fitzgerald,Sara Peralta,Susan Raybuck  http://schedule.sxsw.com/2012/events/event_IAP100409
3  Shawn Dullye,Joe Vasquez,Sara Peralta        http://schedule.sxsw.com/2012/events/event_IAP10593
3  Shawn Dullye,Kiana Fitzgerald,Sara Peralta   http://schedule.sxsw.com/2012/events/event_IAP13848

Of course, that isn't adjusted for how many each person wrote.
'''
    d = select("""
        author, count(*) AS `how-many`
        FROM `links` JOIN topics ON links.`topic-href` = topics.`topic-href`
        GROUP BY author ORDER BY 2 DESC
    """)
    save(['author'], d, 'how-many-did-you-link')
    print """
def find_similar_research():
    research = select('url, plaintext as "value" from maincol where url != ?;', [reference_person])
    research.extend(select('url, plaintext as "value" from maincol where url = ?;', [reference_person]))
    documents = [row['value'].strip() for row in research]

    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    vec = corpus[-1]  # The person being compared to

    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus])
    sims = index[tfidf[vec]]
    print list(enumerate(sims))

    save(['url'], [{"url": row[0], "similarity": row[1][1]} for row in zip([row['url'] for row in research], list(enumerate(sims)))], 'similarity')
def test_wrapper(test_func):
    attach('scraperwiki_events_eventbrite_guestlists')
    original_data = select('* from `ny` where `Twitter Handle`="thomaslevine";')
    table_name = test_func(original_data)
    sleep(5)
    attach('ajax_tractor')
    ajaxed_data = select('* from `%s` where `Twitter Handle`="thomaslevine";' % table_name)
    print original_data, ajaxed_data
    print original_data == ajaxed_data
def go():
    attach('new_mexico_state_audits')
    nodeIds = [row['nodeId'] for row in select('nodeId from nodeIds')]
    scraped_nodeIds = [row['nodeId'] for row in select('nodeId from opinions order by time_scraped')]

    # So you get different information from consecutive partial runs.
    for nodeId in scraped_nodeIds:
        nodeIds.remove(nodeId)
    if len(nodeIds) == 0:
        nodeIds = scraped_nodeIds

    for nodeId in nodeIds:
        print 'Scraping node ' + nodeId
        parse(nodeId)
def get_scraper_state():
    all_views = [row['value'] for row in select('value FROM views ORDER BY value', verbose=False)]
    if 'links' not in show_tables():
        years_to_do = [row['value'] for row in select('value FROM years ORDER BY value', verbose=False)]
        remaining_views_this_year = all_views
    else:
        finished = select('max(view) as "view",year from links where year=(select max(year) from links)', verbose=False)
        years_to_do = [row['value'] for row in select('value FROM years WHERE value>"%s" ORDER BY value' % finished[0]['year'], verbose=False)]
        remaining_views_this_year = [row['value'] for row in select('value from views where value>"%s"' % finished[0]['view'], verbose=False)]
        del(finished)
    return {
        "all-views": all_views,
        "years-to-do": years_to_do,
        "remaining-views-this-year": remaining_views_this_year
    }
def join():
    disclosures = select('Entity,upper(Entity) as "ENTITY" from disclosures where entity is not null')
    disclosures_cleaned = [{
        "raw": row['Entity'],
        "clean": remove_ny(row['ENTITY']).strip()
    } for row in disclosures]
    save([], disclosures_cleaned, 'disclosures_cleaned')

    licenses = select('Vendor,upper(Vendor) as "VENDOR" from swdata where Vendor is not null')
    licenses_cleaned = [{
        "raw": row['Vendor'],
        "clean": remove_ny(row['VENDOR']).strip()
    } for row in licenses]
    save([], licenses_cleaned, 'licenses_cleaned')
def geocode():
    if "address" not in show_tables():
        initialize()
    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]
        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params)
        commit()
def find_similar_research():
    research = select('url, value from maincol where url != ?;', [reference_person])
    research.extend(select('url, value from descriptions where url = ?;', [reference_person]))
    documents = [row['value'].strip() for row in research]

    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    vec = corpus.pop()  # The person being compared to

    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus])
    sims = index[tfidf[vec]]

    # enumerate(sims) yields (index, similarity) pairs, so take the second element.
    save(['url'], [{"url": row[0], "similarity": row[1][1]} for row in zip([row['url'] for row in research], list(enumerate(sims)))], 'similarity')
def update_item(self, item, item_type, unique_keys, table_name):
    now = datetime.utcnow()
    where = ' and '.join([("%s = '%s'" % (ukey, item[ukey])) for ukey in unique_keys[item_type]])
    sqlquery = "* from %s where %s" % (table_name, where)
    try:
        item_in_database = sqlite.select(sqlquery)
        # The item to be saved doesn't have these fields.
        saved_item = {key: value for key, value in item_in_database[0].iteritems()
                      if value and key not in ['created', 'modified']}
        # Ignoring blank/Nones etc.
        current_item = {key: value for key, value in item.iteritems() if value}
        item['modified'] = (item_in_database[0]['modified']
                            if current_item == saved_item and item_in_database[0]['modified']
                            else now)
        item['created'] = (item_in_database[0]['created']
                           if item_in_database[0]['created'] else now)
    except (sqlite.SqliteError, IndexError, KeyError):
        item['created'] = item['modified'] = now
    return item
def swversion(table_name='swdata'):
    if table_name in show_tables():
        timestamp = select("max(date_extracted) as m from %s;" % table_name)[0]['m']
        execute("ALTER TABLE `%s` RENAME TO `%s_%d`;" % (table_name, table_name, timestamp))
        commit()
def extract_postcodes():
    sql = ' `rowid`, `address` from `branches`;'
    for row in select(sql):
        postcodes = findall(r'[0-9]{4}', row['address'])
        if len(postcodes) != 0:
            execute("UPDATE `branches` SET `postcode` = ? WHERE `rowid` = ? ",
                    (postcodes[-1], row['rowid']))
    commit()
def get_page(url, table_name="pages"):
    if not table_name in show_tables():
        raise PageNotSavedError(url)
    else:
        rows = select("`text` from %s where url=?" % table_name, [url])
        l = len(rows)
        if l == 0:
            raise PageNotSavedError(url)
        elif l > 1:
            raise DatastoreError(url, "Multiple rows match this url.")
        elif l == 1:
            if not 'text' in rows[0].keys():
                raise DatastoreError(url, "The database does not have a `text` column.")
            else:
                return rows[0]['text']

#Tests
#import unittest
#class TestGetPage(unittest.TestCase):
#    def test_good_page(self):
#        url = "https://scraperwiki.com/scrapers/dbgetpy/"
#        get_page(url)
#        row = select('* from `pages` where url=?', [url])[0]
#        assertEqual(set(row.keys()), set(["url", "text"]))
#        assertIn("dbget=swimport('dbgetpy')", row['text'])

#if __name__ == '__main__':
#    print "Running tests"
#    unittest.main()
#else:
#    import os
#    print "Running from bash"
#    print os.execvp("python", ["script.py"])
def main():
    # What has already been scraped
    if 'contributions' in show_tables():
        scraped = [row['querystring'] for row in select('querystring from contributions')]
    else:
        scraped = []

    pagenumber = 0
    while True:
        pagenumber = pagenumber + 1
        xml = load(pagenumber)

        # Get the header row
        rows = xml.xpath('//table[@class="table_text"][tr[@class="tan_row"]]')[0].getchildren()[1:]
        keys = ['name', 'contestant_party_district', 'date_received', 'class_and_partnum',
                'association', 'monetary', 'non-monetary']

        # Get the data rows
        ds = []
        for row in rows:
            d = {}  # Fresh dict per row; reusing one dict would make every saved row identical.
            cells = row.getchildren()
            contributor = cells.pop(0).getchildren()[0]
            d['querystring'] = contributor.attrib['href'].replace("javascript:PopUp('contributor.aspx?", '').replace("', '300', '300');", '')
            d[keys[0]] = contributor.text
            for i in range(1, len(cells)):
                d[keys[i]] = cells[i].text
            ds.append(d)

        # Don't run again if already run
        if ds[0]['querystring'] in scraped:
            break
        else:
            save(['querystring'], ds, 'contributions')
def pop(self):
    query = select('* from stack where rowid = (select max(rowid) from stack)')
    instantiate = "%s(%s)" % (query[0]['classname'], dumps(query[0]['url']))
    print instantiate
    obj = eval(instantiate)
    justpopped = obj
    return obj
def oncompletion():
    scrape_ids = [str(row['scrape_id']) for row in select('scrape_id from scrape_times')]
    if 'scrape_completions' in show_tables():
        # Increment id
        completion_id = 1 + select('max("completion_id") as m from scrape_completions')[0]['m']
        # Remove old scrape_ids
        completion_rows = [row['scrape_ids'] for row in select('scrape_ids from scrape_completions')]
        old_scrapes = (','.join(completion_rows)).split(',')
        for old_scrape in old_scrapes:
            scrape_ids.remove(old_scrape)
    else:
        completion_id = 1
    d = {
        "completion_id": completion_id,
        "scrape_ids": ','.join(scrape_ids)
    }
    save(['completion_id'], d, 'scrape_completions')
def main():
    rowid = int(select('max(id) as id from organic_operations')[0]['id'])
    print 'Starting on ' + str(rowid + 1)
    while True:
        rowid = rowid + 1
        done = parse(rowid)['done']
        # Python doesn't support recursion well because it does not support tail recursion elimination
        if done:
            break
def step2():
    # That seems to be near the CPU-time limit.
    urls = [row['url'] for row in select('url from step2completion where browsed=0 limit 1456')]
    for url in urls:
        save_sidebar(url)
        # Then update step2completion; parameters go in a list.
        execute('UPDATE step2completion SET browsed=1 WHERE url=?', [url])
def last(self):
    # Query
    query = select('* from main.stack where rowid = (select max(rowid) from main.stack)')
    # Load
    instantiate = "%s(%s)" % (query[0]['classname'], '"""' + query[0]['url'] + '"""')
    print instantiate
    obj = eval(instantiate)
    return obj
def nextid():
    defaultquery = [{"id": 0}]
    if not OBS in show_tables():
        idquery = defaultquery
    else:
        idquery = select('max(id) as id from %s' % OBS)
        if len(idquery) == 0:
            idquery = defaultquery
    id = idquery[0]['id']
    return id
def scrape(url, table_name="swdata", how_many=10000):
    listurl = attendeelisturl(url)
    d = getattendeelist(listurl)
    d = getattendeelist(listurl + '&show_more=%d&sortid=0' % how_many)
    if table_name in show_tables():
        scraped_so_far = select('count(*) as "c" from `%s`' % table_name)[0]['c']
        saveattendeelist(d[0:-scraped_so_far], table_name)
    else:
        saveattendeelist(d, table_name)
def geocode():
    if "scraped" not in show_tables():
        d = swimport('csv2sw').read.csv('https://views.scraperwiki.com/run/combine_mix_scraper_spreadsheets/')
        save([], d, 'scraped')
    if "address" not in show_tables():
        initialize()
    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]
        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params)
        commit()
def parsenames():
    d = select('`Lobbyist_Name` as "full_name" from `lobbyists`')
    for lobbyist in d:
        splitname = lobbyist['full_name'].split(', ')
        l = len(splitname)
        if l == 2:
            lobbyist['last_name'], lobbyist['first_name'] = splitname
        elif l == 3:
            lobbyist['last_name'], lobbyist['suffix'], lobbyist['first_name'] = splitname
        else:
            # Parenthesize (l - 1) so the subtraction happens before string formatting.
            raise NameDelimiterError("This name has %d commas." % (l - 1))
    save([], d, 'splitnames')
def main():
    for url in getUrls():
        slug = getScraperSlug(url)
        try:
            owners = getScraperOwners(slug)
        except:
            save(['url'], {"url": url}, 'errors')
        else:
            for owner in owners:
                save(['username'], {"username": owner}, 'users')
            save(['url'], {"url": url, "scraped": True}, 'urls')

    print 'Add bio html'
    if "`bio` TEXT" not in execute("SELECT sql FROM sqlite_master WHERE tbl_name = 'users' AND type = 'table'")['data'][0][0]:
        execute("ALTER TABLE `users` ADD COLUMN `bio` TEXT;")
    for username in getUsernames("bio"):
        bio = getUserProfile(username)
        save(['username'], {"username": username, "bio": bio}, 'users')

    print 'Add biotext'
    if "`biotext` TEXT" not in execute("SELECT sql FROM sqlite_master WHERE tbl_name = 'users' AND type = 'table'")['data'][0][0]:
        execute("ALTER TABLE `users` ADD COLUMN `biotext` TEXT;")
    for username in getUsernames("biotext"):
        bio = select('`bio` FROM `users` WHERE `username`=?', [username])[0]["bio"]
        biotext = getBioText(bio)
        save(['username'], {"username": username, "bio": bio, "biotext": biotext}, 'users')

    print 'Add code roles'
    if "`owns` INT" not in execute("SELECT sql FROM sqlite_master WHERE tbl_name = 'users' AND type = 'table'")['data'][0][0]:
        execute("ALTER TABLE `users` ADD COLUMN `owns` INT;")
        execute("ALTER TABLE `users` ADD COLUMN `edits` INT;")
    for username in getUsernames("owns"):
        d = getCodeRoles(username)
        execute("UPDATE `users` SET owns=?,edits=? WHERE username=?", [d["owns"], d["edits"], username])
        commit()

    print 'Add title variation'
    if "`distinct_title_tokens_count` INT" not in execute("SELECT sql FROM sqlite_master WHERE tbl_name = 'users' AND type = 'table'")['data'][0][0]:
        execute("ALTER TABLE `users` ADD COLUMN `distinct_title_tokens_count` INT;")
        execute("ALTER TABLE `users` ADD COLUMN `title_tokens` TEXT;")
    for username in getUsernames("distinct_title_tokens_count"):
        json = getUserJSON(username)
        d = titleVariation(json)
        execute("""
            UPDATE `users`
            SET distinct_title_tokens_count=?, title_tokens_count=?, title_tokens=?
            WHERE username=?;
        """, [d["distinct_count"], d["total_count"], d["text"], username])
        commit()
def check_identical_screenshot(image_base64):
    """Check whether there's an identical screenshot already saved"""
    # If/else to handle new tables
    if 'images' in show_tables():
        identical_screenshot = select('screenshot_id from images where image="' + image_base64 + '" limit 1')
    else:
        identical_screenshot = []

    if len(identical_screenshot) == 0:
        # No identical screenshot
        if 'images' in show_tables():
            screenshot_id = select('max(screenshot_id) as id from images')[0]['id'] + 1
        else:
            screenshot_id = 1
        return (False, {"screenshot_id": screenshot_id, "image": image_base64})
    elif len(identical_screenshot) == 1:
        return (True, identical_screenshot[0])
def geocode():
    if "scraped" not in show_tables():
        d = swimport('csv2sw').read.csv('http://hacks.thomaslevine.com/all.csv')
        save([], d, 'scraped')
        execute('DELETE FROM `scraped` WHERE `Country` != "South Africa"')
        commit()
    if "address" not in show_tables():
        initialize()
    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]
        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params)
        commit()
def atomic():
    if "client" == pagetype(get_var('previous_href')):
        table_names = CLIENT_TABLES
    elif "lobbyist" == pagetype(get_var('previous_href')):
        table_names = LOBBYIST_TABLES
    else:
        raise ResumeError('The type of the previous href, "%s", could not be determined.' % get_var('previous_href'))

    if "clients_urls" in show_tables():
        sourceUrl = select('distinct sourceUrl as "s" from `clients_urls` where jobId=(select max(jobId) from `clients_urls`)')[0]['s']
        for table_name in table_names:
            execute('DELETE FROM `%s` where jobId in (select jobId from clients_urls where sourceUrl="%s")' % (table_name, sourceUrl))
        commit()
        return sourceUrl
def main(mode="initial"):
    end = 10000  # seems like the most recent from http://www.historic.org.nz/en/TheRegister/RecentReg.aspx
    if mode == "initial":
        accessed = [p['place_id'] for p in store.select("place_id FROM _places_accessed")]
        #start = max(accessed)
        for ref in xrange(end):
            if ref not in accessed:
                print ref
                do_place(ref)
    elif mode == "crawl":
        start = 0
        for ref in xrange(start, end):
            do_place(ref)
def go(number=1, pagetype="SCRAPERS"):
    foo = scrapepage(number, pagetype)
    # select() returns rows as dicts, so compare against the url values themselves.
    is_end = ('scraper_urls' in show_tables()) and \
             (foo['lasturl'] in [row['url'] for row in select('url from scraper_urls')])
    # Save after checking whether it's the end because that's how I check.
    save(['url'], foo['scraper_urls'], 'scraper_urls')
    if foo['lastpage']:
        # End when we reach the last page
        print "I scraped all the scrapers!"
    elif is_end:
        # End when we reach a page where a scraper has already been scraped
        print "I scraped all of the new scrapers!"
    else:
        go(number + 1, pagetype)
def main():
    if not 'cities_done' in show_tables():
        cities_done = []
    else:
        cities_done = select('* from cities_done')

    for fromcity in CITIES_NY:
        for tocity in CITIES_NY:
            if fromcity == tocity:
                print 'Skipping within-%s route' % fromcity
            elif {"from": fromcity, "to": tocity} in cities_done:
                print 'Already scraped %s to %s' % (fromcity, tocity)
            else:
                grab(fromcity, "NY", tocity, "NY")
                save([], {"from": fromcity, "to": tocity}, 'cities_done')
def moreparsing_map():
    "Map along the most recent results in the table (like a Couch map) and return a new one"
    d = select("* FROM `swdata` WHERE date_scraped=(SELECT max(date_scraped) from `swdata`);")
    for row in d:
        row['street-address'], row['postal-code'] = splitAddress(row['Address_'])
        row['town'] = extractTown(row['branchName'])

    if 'final' in show_tables():
        execute('DROP TABLE `final`;')

    d_final = []
    for row in d:
        if row['regionName'] not in ["Botswana", "Malawi", "Nambia"]:
            d_final.append(row)
    save([], d_final, 'final')
def separate_addresses():
    execute('DROP TABLE IF EXISTS final')
    commit()
    d = select('* from `initial`')
    for row in d:
        splitaddress = row['address'].split('\n')
        l = len(splitaddress)
        if l == 3:
            row['street-address'], row['subtown'], row['town2'] = splitaddress
        elif l == 2:
            row['street-address'], row['subtown'] = splitaddress
        else:
            raise AddressError
        row['street-address'] = row['street-address'].strip()
        row['address'] = strip_address(row['address'])
    save([], d, 'final')
def cp1():
    execute('''
        CREATE TABLE IF NOT EXISTS `businessPremises` (
            `date_scraped` REAL,
            `businessPremisesURL` TEXT,
            FOREIGN KEY (date_scraped, businessPremisesUrl)
                REFERENCES cp1(date_scraped, businessPremisesUrl)
        )
    ''')

    if get_var('crashed') == 1:
        pagenum = select('max(pagenum) from cp1 where date_scraped = (select max(date_scraped) from cp1)')[0]['max(pagenum)']
        print "Resuming from page %d" % pagenum
        p = Page('CP1')
        p = Page('CP1', s=p.s, pagenum=pagenum)
    else:
        print "Starting a new run"
        p = Page('CP1')

    while p.lastpage() == False:
        print "Beginning page %d" % p.pagenum
        tables = p.table().subtables()
        d = []
        for table in tables:
            row = table.parse()
            row['businessPremisesURL'] = table.business_premises_url()
            try:
                business_premises_data, more_registrant_data = table.business_premises(p.s)
            except Exception, msg:
                print "Error on %s: %s" % (table.business_premises_url(), msg)
                sleep(60)
                print "Trying again"
                business_premises_data, more_registrant_data = table.business_premises(p.s)
            row['date_scraped'] = DATE
            row['pagenum'] = p.pagenum
            row['url'] = URL + "?page=%d" % p.pagenum
            row.update(more_registrant_data)
            save([], business_premises_data, 'businessPremises')
            save(['date_scraped', 'businessPremisesURL'], row, 'cp1')
            sleep(1)
        save_var('crashed', 1)
        p = p.next25()
def parse(url, xml=None, suffix=''):
    if xml == None:
        xml = pull(url)

    print "Loading the page"
    scrapers = xml.xpath(PATH)
    for scraper in scrapers:
        if 'observations' in show_tables():
            observation_id = select('max(observation_id) as id from observations')[0]['id'] + 1
        else:
            observation_id = 1

        identifiers = {"observation_id": observation_id}
        info = copy(identifiers)
        screenshot_identity = copy(identifiers)

        identifiers['time_scraped'] = time()
        identifiers['url'] = scraper.xpath('a')[0].attrib['href']

        print "Extracting metadata"
        info['owner'], info['title'] = scraper.xpath('a/h4')[0].text.split('/', 1)
        info['language'], info['type'] = re.split(r'[^a-zA-Z]+', scraper.xpath('a/span[@class="about"]')[0].text)
        info['created'] = scraper.xpath('a/span[@class="when"]')[0].text
        screenshot_identity['url'] = scraper.xpath('a/img')[0].attrib['src']

        print "Checking whether I've already saved the screenshot"
        exists, image = check_identical_screenshot(getimage(screenshot_identity['url']))
        if exists:
            # If I have, don't do anything with the image
            print "Screenshot already saved"
        else:
            # If I haven't, save a new image
            print "Saving the new screenshot"
            image['observation_scraped_on'] = observation_id
            save(['observation_scraped_on', 'screenshot_id'], image, 'images')

        # Either way, link the observation to the saved image
        screenshot_identity['screenshot_id'] = image['screenshot_id']
        save(['observation_id'], screenshot_identity, 'screenshot_identidies')

        # Save these at the end to avoid partial rows
        print "Saving"
        save(['observation_id'], info, 'homepage_metadata')
        save(['observation_id'], identifiers, 'observations')