sourcescraper = "city-of-ottawa-development-applications" limit = 15 print '<?xml version="1.0" encoding="UTF-8" ?>\n' print ' <rss version="2.0">\n' print ' <channel>\n' print ' <title>City of Ottawa Development Applications</title>\n' print ' <description>Unofficial RSS feed for City of Ottawa Development Applications</description>\n' print ' <link>http://scraperwiki.com/scrapers/%s/</link>\n' % sourcescraper print ' <lastBuildDate>%s</lastBuildDate>\n' % strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()) print ' <pubDate>%s</pubDate>\n' % strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()) # rows current = 0 for row in sorted(getData(sourcescraper, 0, 0), reverse=True, key=lambda row: strptime(row.get('Status_Date'), '%b %d, %Y')): current += 1 if current >= limit: break title = row.get('Application_Number') + ' - ' + row.get('Primary_Address') + ' - ' + row.get('Review_Status') print ' <item>\n' print ' <title>%s</title>\n' % title print ' <description>%s</description>\n' % row.get('Description') print ' <link>%s</link>\n' % escape(row.get('Application_Link')) print ' <guid>%s</guid>\n' % row.get('Application_Number') print ' <pubDate>%s</pubDate>\n' % strftime("%a, %d %b %Y %H:%M:%S +0000", strptime(row.get('Status_Date'), '%b %d, %Y')) print ' </item>\n' print ' </channel>\n'
from scraperwiki.apiwrapper import getKeys, getData, getDataByDate, getDataByLocation

sourcescraper = "pl"  # data table
limit = 100  # number of records
offset = 0

keys = getKeys(sourcescraper)
#keys.sort()  # alphabetically

print '<h2>Some data from scraper: %s (%d columns)</h2>' % (sourcescraper, len(keys))
print '<table border="4" style="border-collapse:collapse;">'

# this section prints the column headings
print "<tr>",
for key in keys:
    print "<th>%s</th>" % key,
print "</tr>"

# this section prints the rows
for row in getData(sourcescraper, limit, offset):
    print "<tr>",
    for key in keys:
        print "<td>%s</td>" % row.get(key),
    print "</tr>"

print "</table>"
sourcescraper = "dun-laoghaire-rathdown-county-council-planning-app" keys = getKeys(sourcescraper) keys.sort() # alphabetically print '<h2>Data from scraper: %s</h2>' % sourcescraper print '<table border="1" style="border-collapse:collapse;">' # column headings print "<tr>", for key in keys: print "<th>%s</th>" % key, print "</tr>" # rows for row in getData(sourcescraper): print "<tr>", for key in keys: print "<td>%s</td>" % row.get(key), print "</tr>" print "</table>" ######################################### # Simple table of values from one scraper ######################################### from scraperwiki.apiwrapper import getKeys, getData, getDataByDate, getDataByLocation sourcescraper = "dun-laoghaire-rathdown-county-council-planning-app" keys = getKeys(sourcescraper) keys.sort() # alphabetically
# USP - Courses (Disciplinas)
#
# Information collected from the JupiterWeb system
#
##########################################################################################################
sourcescraper = "usp-departamentos"  # listing of every USP department in units that have an undergraduate committee (Comissão de Graduação)

import scraperwiki
import re
from BeautifulSoup import BeautifulSoup
from scraperwiki.apiwrapper import getKeys, getData, getDataByDate, getDataByLocation
from operator import itemgetter, attrgetter

keys = getKeys(sourcescraper)
departamentos = getData(sourcescraper)
total_disciplinas = 0

for departamento in departamentos:
    ##################### collecting the courses of one department #####################
    codigoUnidade = departamento.get(keys[2])
    print codigoUnidade
    siglaDepartamento = departamento.get(keys[1])
    print siglaDepartamento
    nomeDepartamento = departamento.get(keys[3])

    ## search link
    starting_pagina_url = 'http://sistemas2.usp.br/jupiterweb/jupDisciplinaLista?codcg=' + codigoUnidade + '&pfxdisval=' + siglaDepartamento + '&tipo=D'
    ## fetch the page HTML
    html_pagina = scraperwiki.scrape(starting_pagina_url)
    ## turn it into a BeautifulSoup object
    paginaSoup = BeautifulSoup(html_pagina)
    ## use BeautifulSoup to grab the list of lecturers and their codes
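    # --- Sketch only, not part of the original view --------------------------------
    # The loop stops before the extraction step described above. Assuming,
    # hypothetically, that JupiterWeb lists each course as an <a> tag whose text
    # starts with the department prefix, that step could look like this:
    for link in paginaSoup.findAll('a'):
        texto = link.string
        if texto and texto.strip().startswith(siglaDepartamento):
            total_disciplinas += 1                    # count every course found for this department
            print nomeDepartamento, texto.strip()     # department name followed by the course code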
#########################################
# Simple table of values from one scraper
#########################################
from scraperwiki.apiwrapper import getKeys, getData, getDataByDate, getDataByLocation

sourcescraper = "environment-agency-river-level-monitors"

# a bigger limit will get a more representative sample,
# or set a high offset to get a different sample
limit = 200
offset = 0

m = list(getData(sourcescraper, limit, offset))

def ks(a):
    # sort key: monitors with the highest proportion first; missing values sort last
    v = a.get('proportion')
    if not v:
        return 0.0
    return -float(v)

m.sort(key=ks)

for a in m[:200]:
    if not a.get('low'):
        continue
    low = float(a.get('low'))
    high = float(a.get('high'))
    level = float(a.get('level'))
    proportion = float(a.get('proportion'))
    letter = level > 0.5 and "W" or "L"   # old-style conditional: "W" when level > 0.5, otherwise "L"
    img = "http://www.ocean30.us/idx/prop-list.html?page=1" % (int(proportion * 100))   # the URL string has no format placeholder left, so this raises TypeError as written
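# --- Sketch only, not part of the original view ------------------------------------
# 'proportion' comes straight from the scraper; presumably it expresses the level as
# a fraction of the monitor's low..high range. Recomputing it from the other three
# fields (an assumption about how the scraper defines it) would make the view robust
# to rows where the stored value is missing:
def level_proportion(low, high, level):
    if high == low:
        return 0.0                       # avoid dividing by zero on degenerate ranges
    return (level - low) / (high - low)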
# Here we're going to build an RSS feed from the NPIA press release data.
# This view is based on the Lincoln Council Committee RSS view.
from scraperwiki.apiwrapper import getKeys, getData, getDataByDate, getDataByLocation
from time import strftime, strptime, gmtime
from xml.sax.saxutils import escape

limit = 15

print '<?xml version="1.0" encoding="UTF-8" ?>\n'
print ' <rss version="2.0">\n'
print ' <channel>\n'
print ' <title>National Policing Improvement Agency Press Releases</title>\n'
print ' <description>RSS feed for Press Releases from the National Policing Improvement Agency</description>\n'
print ' <link>http://scraperwiki.com/scrapers/%s/</link>\n' % sourcescraper
print ' <lastBuildDate>%s</lastBuildDate>\n' % strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
print ' <pubDate>%s</pubDate>\n' % strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())

# rows, newest first
current = 0
for row in sorted(getData(sourcescraper, 0, 0), reverse=True, key=lambda row: strptime(row.get('date'), '%Y-%m-%d')):
    current += 1
    if current >= limit:
        break
    print ' <item>\n'
    print ' <title>%s</title>\n' % row.get('title')
    print ' <link>%s</link>\n' % escape(row.get('url'))
    print ' <pubDate>%s</pubDate>\n' % strftime("%a, %d %b %Y %H:%M:%S +0000", strptime(row.get('date'), '%Y-%m-%d'))
    print ' </item>\n'
print ' </channel>\n'
print '</rss> \n'
from scraperwiki.apiwrapper import getKeys, getData, getDataByDate, getDataByLocation
import urllib
import csv
import datetime

days4 = datetime.date.today() - datetime.timedelta(2)
days8 = datetime.date.today() - datetime.timedelta(4)

sourcescraper = "takeover-panel-info"

# get from the primary data
limit = 90
offset = 0
rows = list(getData(sourcescraper, limit, offset))

# join the human-generated data from the spreadsheet
# couldn't download from google directly!!! so had to cache the result
#url = "http://spreadsheets.google.com/feeds/download/spreadsheets/Export?key=0Aju6C2vCJrTNdEJkRkNGR1V4ZHJzWHRGR2x6cmEzN1E&exportFormat=csv"
url = 'http://seagrass.goatchurch.org.uk/~julian/takeoverpanel.csv'
url = 'https://spreadsheets.google.com/pub?key=0Aju6C2vCJrTNdEJkRkNGR1V4ZHJzWHRGR2x6cmEzN1E&output=csv'
fin = urllib.urlopen(url)

hdata = {}
lines = fin.readlines()
clist = list(csv.reader(lines))
headers = clist.pop(0)
for row in clist:
    hrow = dict(zip(headers, row))
    stockcode = hrow.get('OffereeStockCode')
    hrow['articles'] = []
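    # --- Sketch only, not part of the original view --------------------------------
    # The listing stops before the join itself. Presumably each spreadsheet row is
    # indexed by its stock code so the scraped articles can be attached to it; the
    # 'stockcode' field assumed on the scraped rows below is hypothetical.
    hdata[stockcode] = hrow

for row in rows:
    hrow = hdata.get(row.get('stockcode'))   # hypothetical field name on the scraped rows
    if hrow is not None:
        hrow['articles'].append(row)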
#########################################
# Simple table of values from one scraper
#########################################
from scraperwiki.apiwrapper import getKeys, getData, getDataByDate, getDataByLocation

sourcescraper = "cost_of_gov_websites"
limit = 20
offset = 0

keys = getKeys(sourcescraper)
keys.sort()  # alphabetically

print '<h2>Some data from scraper: %s (%d columns)</h2>' % (sourcescraper, len(keys))
print '<table border="1" style="border-collapse:collapse;">'

# column headings
print "<tr>",
for key in keys:
    print "<th>%s</th>" % key,
print "</tr>"

# rows
for row in getData(sourcescraper, limit, offset):
    print "<tr>",
    for key in keys:
        print "<td>%s</td>" % row.get(key),
    print "</tr>"

print "</table>"
sourcescraper = "lisburn-city-council-minutes" limit = 15 print '<?xml version="1.0" encoding="UTF-8" ?>\n' print ' <rss version="2.0">\n' print " <channel>\n" print " <title>Lisburn City Council minutes</title>\n" print " <description>RSS feed for the latest public minutes published by Lisburn City Council</description>\n" print " <link>http://scraperwiki.com/scrapers/%s/</link>\n" % sourcescraper print " <lastBuildDate>%s</lastBuildDate>\n" % strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()) print " <pubDate>%s</pubDate>\n" % strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()) # rows current = 0 for row in sorted(getData(sourcescraper, 0, 0), reverse=True, key=lambda row: strptime(row.get("Date"), "%d %B %Y")): current += 1 if current >= limit: break print " <item>\n" print " <title>%s</title>\n" % row.get("Title") print " <description>%s</description>\n" % row.get("Committee") print " <link>%s</link>\n" % escape(row.get("Link")) print " <pubDate>%s</pubDate>\n" % strftime( "%a, %d %b %Y %H:%M:%S +0000", strptime(row.get("Date"), "%d %B %Y") ) print " </item>\n" print " </channel>\n" print "</rss> \n" # Creating an RSS feed from the 'Lisburn City Council minutes' scraper
from scraperwiki.apiwrapper import getKeys, getData, getDataByDate, getDataByLocation
import os
import urllib

limit = 200
offset = 0

cin = {}
cco = {}
memberkeys = ['advisors', 'banks', 'contractor', 'members', 'private_advisors', 'private_contractors']
pfis = []

# an optional member name can be passed to the view on the query string
memberquery = os.getenv('URLQUERY')
if memberquery:
    memberquery = urllib.unquote_plus(memberquery[2:])   # drop the two-character prefix, then decode the URL-encoding
    print memberquery

for i, row in enumerate(getData(sourcescraper, limit, offset)):
    if i <= 1:
        print row
    members = set()
    for k in memberkeys:
        # each member column holds a Python-style list literal
        try:
            members = members.union(eval(row.get(k, '[]')))
        except SyntaxError:
            pass
        except NameError:
            pass
    value = float(row.get('Capital_Value', 0))
    members = sorted(members)
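# --- Sketch only, not part of the original view ------------------------------------
# The member columns hold Python-style list literals, which the loop above parses
# with eval(). ast.literal_eval accepts the same literals without executing
# arbitrary code, so an equivalent, safer version of that inner loop could be:
import ast

def parse_members(row, keys):
    members = set()
    for k in keys:
        try:
            members |= set(ast.literal_eval(row.get(k, '[]')))
        except (ValueError, SyntaxError):
            pass   # skip columns that do not hold a well-formed list literal
    return members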
# RSS feed for Monthly Committee Updates from the City of Lincoln Council
from scraperwiki.apiwrapper import getKeys, getData, getDataByDate, getDataByLocation
from time import strftime, strptime, gmtime
from xml.sax.saxutils import escape

limit = 15

print '<?xml version="1.0" encoding="UTF-8" ?>\n'
print ' <rss version="2.0">\n'
print ' <channel>\n'
print ' <title>City of Lincoln Council Committee Updates</title>\n'
print ' <description>RSS feed for Monthly Committee Updates from the City of Lincoln Council</description>\n'
print ' <link>http://scraperwiki.com/scrapers/%s/</link>\n' % sourcescraper
print ' <lastBuildDate>%s</lastBuildDate>\n' % strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
print ' <pubDate>%s</pubDate>\n' % strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())

# rows, newest first
current = 0
for row in sorted(getData(sourcescraper, 0, 0), reverse=True, key=lambda row: strptime(row.get('Date'), '%d %B %Y')):
    current += 1
    if current >= limit:
        break
    print ' <item>\n'
    print ' <title>%s</title>\n' % row.get('Title')
    print ' <description>%s</description>\n' % row.get('Event')
    print ' <link>%s</link>\n' % escape(row.get('Link'))
    print ' <pubDate>%s</pubDate>\n' % strftime("%a, %d %b %Y %H:%M:%S +0000", strptime(row.get('Date'), '%d %B %Y'))
    print ' </item>\n'
print ' </channel>\n'
print '</rss> \n'
from scraperwiki.apiwrapper import getKeys, getData, getDataByDate, getDataByLocation

print '</br>'
print '<tr><font color=red face="verdana" size=5>POINTS TABLE</font></th></tr>'
print '</br>'
print '</br>'
print '<a href="http://scraperwiki.com/views/jan11-match-result-premier-league/full/">January 2011 Results</a>'
print '</br>'
print '</br>'
print '<table border="5" bgcolor ="CCCCFF" cellpadding="15" style="border-collapse:collapse;">'

# column headings section
print "<tr>",
for key in keys:
    print "<th><bgcolor=003366>%s</th>" % key,
print "</tr>"

# rows section
for row in getData(sourcescraper, limit, offset):
    print "<tr>",
    for key in keys:
        print "<td>%s</td>" % row.get(key),
    print "**"
    print "</tr>"

print "</table>"
print '<h4>Result from scraper: %s</h4>' % (sourcescraper)
print '<h5>Creator: 1019053 - Yomal A. Mudalige</h5>'

##################
# Note from the creator (Yomal Mudalige): I have tried to change the column order,
# but it is still not finalised, so I added the prefixes 'A', 'B', etc. Thanks.
# References for the scraper: ScraperWiki Tutorial 3 and "Python Power!: The
# Comprehensive Guide" by Matt Telles, p. 333.
##################
#########################################
# Simple table of values from one scraper
#########################################
from scraperwiki.apiwrapper import getKeys, getData, getDataByDate, getDataByLocation
import re

sourcescraper = "swale-democratic-services-events-diary"
limit = 30
offset = 0

keys = getKeys(sourcescraper)
data = getData(sourcescraper, 1, offset)

# regular expressions for pulling the meeting link and name out of each record
ure = re.compile(".*(http.*uid=[0-9]*)", re.DOTALL)
nre = re.compile(".*strong.(.*strong)", re.DOTALL)
allre = re.compile(".*", re.DOTALL)

# feed metadata: take the <updated> timestamp from the first row
for row in getData(sourcescraper, limit, offset):
    updated = "%s%s" % (row.get('datetime'), "Z")
    break

print """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns='http://www.w3.org/2005/Atom'>
<title type='text'>Swale Democratic Services Calendar</title>
<subtitle type='html'>Open Swale</subtitle>
<updated>%s</updated>
<id>http://scraperwikiviews.com/run/feed/?</id>