# Crawl the theater search results for every US zipcode, skipping zipcodes
# that earlier runs already finished.
def scraperwiki():
    #zipcodes=get_zips(urlopen(URLS['zips']))
    attach('us_zip_codes')
    zipcodes = [str(row['zip']) for row in select('zip from zipcodes')]

    # Skip zipcodes that are already finished.
    try:
        finished_zipcodes = [row['zipcode'] for row in select('zipcode from finished_zipcodes')]
    except:
        pass
    else:
        #print 'Already scraped these zipcodes:'
        for zipcode in finished_zipcodes:
            try:
                zipcodes.remove(zipcode)
            except ValueError:
                # The zipcodes database isn't complete.
                pass

    for zipcode in zipcodes:
        print 'Scraping ' + zipcode
        lastpage = int(get_lastpage(search(zipcode, '1', save=False)))
        for page in [str(p) for p in range(1, lastpage + 1)]:
            theaters = get_theaters(zipcode, page, save=False)
            for theater in theaters:
                info = theater_info(theater)
                info = clean_info(info)
                save(['url'], info2dictRow(info, zipcode), 'locations')
            sleep(INTERVAL)
        save(['zipcode'], {'zipcode': zipcode}, 'finished_zipcodes')
def go():
    attach('new_mexico_state_audits')
    nodeIds = [row['nodeId'] for row in select('nodeId from nodeIds')]
    scraped_nodeIds = [row['nodeId'] for row in select('nodeId from opinions order by time_scraped')]

    # So you get different information from consecutive partial runs.
    for nodeId in scraped_nodeIds:
        nodeIds.remove(nodeId)
    if len(nodeIds) == 0:
        nodeIds = scraped_nodeIds

    for nodeId in nodeIds:
        print 'Scraping node ' + nodeId
        parse(nodeId)
def test_wrapper(test_func):
    attach('scraperwiki_events_eventbrite_guestlists')
    original_data = select('* from `ny` where `Twitter Handle`="thomaslevine";')
    table_name = test_func(original_data)
    sleep(5)
    attach('ajax_tractor')
    ajaxed_data = select('* from `%s` where `Twitter Handle`="thomaslevine";' % table_name)
    print original_data, ajaxed_data
    print original_data == ajaxed_data
def copyUrlsDb():
    attach('scraperwiki_scraper_urls')
    save(['url'], select('url,0 as "scraped" from scraper_urls'), 'urls')
#!/usr/bin/env python
from urllib2 import urlopen
from lxml.html import fromstring
from lxml.etree import tostring
from scraperwiki.sqlite import save, attach, select
from BeautifulSoup import BeautifulSoup

attach('alafia_network')
pages = select('* from html')

def main():
    for page in pages:
        number = page['page']
        # Replace <br />s with a better delimiter.
        html = page.pop('table').replace('<br />', '|')
        text = ''.join(BeautifulSoup(html).findAll(text=True))
        for row in text.split('=-=-=-=-=-=-=-=-=-=-=-=-=-=-='):
            for column in row.split('|'):
                cell = column.split(':')
                if len(cell) == 2:
                    key, value = cell
                    key = key.encode('ascii', 'ignore').replace(' ', '')
                    if '&#' in key:
                        key = key.split('&#')[0]
                    page[key] = value
        save(['page'], page, 'microfinance_institutions')

main()
import scraperwiki
import scraperwiki.sqlite as sql

sql.attach("aeso_current_supply_demand_report", "aeso")
sql.attach("alberta_wind_farms", "wind")
data = sql.select(
    "aeso.swdata.name, wind.swdata.lat, wind.swdata.lon, aeso.swdata.mc, "
    "aeso.swdata.tng, wind.swdata.turbineType "
    "FROM aeso.swdata LEFT JOIN wind.swdata ON aeso.swdata.name = wind.swdata.name"
)
print data
sql.save(unique_keys=["name"], data=data)
# TO DO
# * Add some error metric, probably based on latitude and longitude.
# * Consider place type.
# * Run more data.
import operator
from scraperwiki.sqlite import attach, select, get_var, save_var, save, execute, commit
from urllib2 import urlopen
from urllib import urlencode
from json import loads, dumps
from time import sleep
from unidecode import unidecode

attach('capitec_bank')

GEOCODE_URL = 'http://open.mapquestapi.com/nominatim/v1/search?format=json&json_callback=&%s'
COLUMNS = [
    # Null
    # 'address',
    # Main effects
    # 'branchName',
    # 'provinceName',
    # Second-order
    # 'branchName || ", South Africa"',
    'provinceName || ", South Africa"',
    # 'branchName || ", " || provinceName',
    # All
    # 'branchName || ", " || provinceName || ", South Africa"'
]
def initialize():
    execute("""
        CREATE TABLE `address` (
            `address-column` text,
            `address-input` text,
            `finished` integer
        )""")
    execute("CREATE UNIQUE INDEX column ON `address` (`address-column`,`address-input`);")

    try:
        PREVIOUS_SCRAPER
    except:
        execute("""
            CREATE TABLE `geocode` (
                `address-geocode` TEXT,
                `latitude-geocode` REAL,
                `longitude-geocode` REAL,
                `number-of-matches` text,
                `service` TEXT,
                `address-input` TEXT
            )""")
    else:
        attach(PREVIOUS_SCRAPER)
        save([], select('* FROM `geocode`'), 'geocode')

    execute("CREATE UNIQUE INDEX geocode_key ON `geocode` (`address-input`, `service`);")
    execute('''
        CREATE TABLE `branch_address` (
            `address-column` TEXT,
            `address-input` TEXT,
            `entityRecord` INTEGER,
            FOREIGN KEY(entityRecord) REFERENCES scraped(rowid)
        )''')
    execute("CREATE UNIQUE INDEX branch_key ON `branch_address` (`entityRecord`, `address-column`);")
    commit()

    for column in COLUMNS:
        execute('''
            INSERT INTO `address` (`address-column`, `address-input`, `finished`)
            SELECT DISTINCT ? as "address-column", %s as "address-input", 0 as "finished"
            FROM `scraped`
        ''' % COLUMNS[column], column)
        commit()
        execute('''
            INSERT INTO `branch_address` (`entityRecord`, `address-column`, `address-input`)
            SELECT `rowid`, ? as "address-column", %s as "address-input"
            FROM `scraped`
        ''' % COLUMNS[column], column)
        commit()

    execute('DELETE FROM `address` WHERE `address-input` IS NULL')
    commit()

    execute('''
        UPDATE address SET finished = 1
        WHERE rowid in (
            select address.rowid
            from `address`
            join `geocode` on address.`address-input` = geocode.`address-input`
            where service = "nominatim"
        )
    ''')
    commit()
def initialize():
    execute("""
        CREATE TABLE `address` (
            `address-column` text,
            `address-input` text,
            `finished` integer
        )""")
    execute("CREATE UNIQUE INDEX column ON `address` (`address-column`,`address-input`);")
    execute("""
        CREATE TABLE `geocode` (
            `address-geocode` TEXT,
            `latitude-geocode` REAL,
            `longitude-geocode` REAL,
            `number-of-matches` text,
            `service` TEXT,
            `address-input` TEXT
        )""")
    execute("CREATE UNIQUE INDEX geocode_key ON `geocode` (`address-input`, `service`);")
    execute('''
        CREATE TABLE `branch_address` (
            `address-column` TEXT,
            `address-input` TEXT,
            `entityRecord` INTEGER,
            FOREIGN KEY(entityRecord) REFERENCES scraped(rowid)
        )''')
    execute("CREATE UNIQUE INDEX branch_key ON `branch_address` (`entityRecord`, `address-column`);")
    commit()

    for column in COLUMNS:
        execute('''
            INSERT INTO `address` (`address-column`, `address-input`, `finished`)
            SELECT DISTINCT ? as "address-column", %s as "address-input", 0 as "finished"
            FROM `scraped`
        ''' % COLUMNS[column], column)
        commit()
        execute('''
            INSERT INTO `branch_address` (`entityRecord`, `address-column`, `address-input`)
            SELECT `rowid`, ? as "address-column", %s as "address-input"
            FROM `scraped`
        ''' % COLUMNS[column], column)
        commit()

    execute('DELETE FROM `address` WHERE `address-input` IS NULL')

    try:
        PREVIOUS_SCRAPER
    except:
        pass
    else:
        attach(PREVIOUS_SCRAPER)
        execute('INSERT INTO main.`geocode` SELECT * FROM `geocode`')
    commit()
def main():
    attach('canada_elections_contributer_table')
    for qs in select('querystring from contributions where querystring not in (select querystring from contributors)'):
        popup(qs)
from scraperwiki.sqlite import attach, select, get_var, save_var, save, execute, commit, show_tables
from urllib2 import urlopen
from urllib import urlencode
from json import loads, dumps
from time import sleep
from unidecode import unidecode
from geopy import geocoders, distance

attach("capitec_bank")

COLUMNS = {
    # Full address
    'full-address': 'address',
    # Town-country
    'town-country': 'branchName || ", South Africa"',
    # Province-country
    'province-country': 'provinceName || ", South Africa"',
    # Town-province-country
    'town-province-country': 'branchName || ", " || provinceName || ", South Africa"',
    # Postcode-country
    'postcode-country': 'postcode || ", " || "South Africa"'
}

def geocode():
    if "address" not in show_tables():
        initialize()
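# The actual fetch against the geocoder is not shown in this snippet. What
# follows is a minimal, hypothetical sketch of a single lookup, assuming the
# standard Nominatim JSON response (a list of matches with 'lat', 'lon' and
# 'display_name' keys). geocode_one is not from the original scraper; it only
# mirrors the GEOCODE_URL constant and the `geocode` table columns defined above.
def geocode_one(address):
    # Build the request URL from the GEOCODE_URL template defined earlier.
    url = GEOCODE_URL % urlencode({'q': unidecode(address)})
    matches = loads(urlopen(url).read())
    sleep(1)  # be gentle with the geocoding service
    if not matches:
        return None
    best = matches[0]
    return {
        'address-geocode': best['display_name'],
        'latitude-geocode': float(best['lat']),
        'longitude-geocode': float(best['lon']),
        'number-of-matches': len(matches),
        'service': 'nominatim',
        'address-input': address,
    }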
from cStringIO import StringIO
from scraperwiki import dumpMessage, sqlite, utils
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

sourcescraper = 'solid_state_drives'
sqlite.attach(sourcescraper, "src")
data = sqlite.select('* from src.swdata')

xs, ys, ys_div = [], [], []
for element in data:
    if not element["size"]:
        continue
    xs.append(float(element["price"]))
    # Strip the two-character unit suffix (e.g. "GB") from the size string.
    ys.append(float(element["size"][:-2]))
    ys_div.append(xs[-1] / ys[-1])

subplot = plt.subplot(211)
plt.xlabel(u"Price (GBP)")
plt.ylabel(u"Size (GB)")
subplot.scatter(xs, ys)

subplot = plt.subplot(212)
plt.xlabel(u"Price (GBP)")
plt.ylabel(u"Price / Size (GBP per GB)")
subplot.scatter(xs, ys_div)
try:
    from scraperwiki.utils import httpresponseheader
    from scraperwiki.sqlite import attach, select
except ImportError:
    def httpresponseheader(a, b):
        pass

from lxml.html import fromstring
from urllib2 import urlopen
from time import time

attach('combine_mix_scraper_spreadsheets_1')
httpresponseheader("Content-Type", "text/csv")
httpresponseheader("Content-Disposition", "attachment; filename=combined_spreadsheets.csv")
print select('spreadsheet from combined_spreadsheets where time = (select max(time) from combined_spreadsheets)')[0]['spreadsheet']
    return results
    '''
    all_quotes = re.compile(r'"([^"]*)[,.?!]"')
    for value in all_quotes.findall(text):
        quote = {
            'name': '',
            'quote': value
        }
        results.append(quote)
        print "a quote" + value
    '''

attach('feedzilla')
articles = select('* from feedzilla.swdata limit 50')
for article in articles:
    quotes = extract_quotes(article['text'])
    if not quotes:
        continue
    for quote in quotes:
        record = {
            'url': article['url'],
            'date': article['date'],
            'name': quote['name'],
            'quote': quote['quote']
        }
        save([], record)
    all_quotes = re.compile(r'"([^"]*)[,.?!]"')
    for value in all_quotes.findall(text):
        quote = {
            'name': '',
            'quote': value
        }
        #results.append(quote)
        #print "a quote" + value
    return results

print extract_quotes('"yes," declared Alex.')

attach('feedzilla')
articles = select('* from feedzilla.swdata limit 20')
for article in articles:
    quotes = extract_quotes(article['text'])
    if not quotes:
        continue
    for quote in quotes:
        record = {
            'url': article['url'],
            'date': article['date'],
            'name': quote['name'],
            'quote': quote['quote']
        }
        print record
        #save([], record)
#from urllib import urlencode
from json import loads
from urllib2 import urlopen
from scraperwiki.sqlite import attach, select

attach('nyc_lobbyist_directory_browser')

QUERIES = {
    "summary": """
SELECT
  sum(reimbursement) AS "Total Reimbursement"
, sum(compensation) AS "Total Compensation"
FROM `clients_details`
WHERE `detailId`="Total"
""",
    "completed": """
SELECT count(`href`) AS "completed"
FROM `links`
WHERE `href`<(
  SELECT `value_blob`
  FROM `swvariables`
  WHERE `name`="previous_href"
)
"""
}
from urllib2 import urlopen
from scraperwiki import swimport
from scraperwiki.sqlite import save, select, attach

URL = "http://sw.thomaslevine.com/disclosures.csv"
attach('open_book_new_york')

def load_disclosures():
    csv = urlopen(URL)
    d = swimport('csv2sw').csv2json(csv)
    save([], d, 'disclosures')

def join():
    disclosures = select('Entity,upper(Entity) as "ENTITY" from disclosures where entity is not null')
    disclosures_cleaned = [{
        "raw": row['Entity'],
        "clean": remove_ny(row['ENTITY']).strip()
    } for row in disclosures]
    save([], disclosures_cleaned, 'disclosures_cleaned')

    licenses = select('Vendor,upper(Vendor) as "VENDOR" from swdata where Vendor is not null')
    licenses_cleaned = [{
        "raw": row['Vendor'],
        "clean": remove_ny(row['VENDOR']).strip()
    } for row in licenses]
    save([], licenses_cleaned, 'licenses_cleaned')

def remove_ny(string):
    # Strip New York State qualifiers so entity names can be matched across tables.
    for ny in ("N.Y.S.", "NYS", "NEW YORK STATE", "NEW YORK", "N.Y.", "NY"):
        string = string.replace(ny, '')
    return string
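# A hedged sketch, not part of the original scraper: once join() has populated
# disclosures_cleaned and licenses_cleaned, the normalized names could be
# matched with a query along these lines. matched_entities and the exact query
# are assumptions; the table and column names come from join() above.
def matched_entities():
    return select('''
        d.raw AS "disclosure_entity",
        l.raw AS "license_vendor",
        d.clean AS "normalized_name"
        FROM disclosures_cleaned d
        JOIN licenses_cleaned l ON d.clean = l.clean
    ''')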