from scraperwiki.utils import swimport
dbget = swimport('dbgetpy')

#Save
urls = ["https://scraperwiki.com/scrapers/dbgetpy", "https://scraperwiki.com"]
for url in urls:
    dbget.save_page(url)

#Retrieve
html = dbget.get_page("https://scraperwiki.com/scrapers/dbgetpy")
print html
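# A minimal sketch of what a dbgetpy-style page cache could look like,
# assuming it stores fetched HTML in a ScraperWiki SQLite table keyed by URL.
# The table name 'page_cache' and its columns are assumptions for
# illustration, not dbgetpy's actual schema.
from urllib2 import urlopen
from scraperwiki.sqlite import save, select

def save_page_sketch(url):
    # Fetch the page and upsert it, keyed on the URL
    html = urlopen(url).read()
    save(['url'], {'url': url, 'html': html}, 'page_cache')

def get_page_sketch(url):
    # Look the URL up in the cache; None if it was never saved
    rows = select("html from page_cache where url = ?", [url])
    return rows[0]['html'] if rows else None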
#!/usr/bin/env python2
from urllib2 import urlopen, build_opener, HTTPCookieProcessor
from lxml.html import fromstring
from httplib import BadStatusLine
from json import loads
try:
    from htmltable2matrix import htmltable2matrix
except ImportError:
    from scraperwiki.utils import swimport
    htmltable2matrix = swimport('htmltable2matrix').htmltable2matrix
from scraperwiki.sqlite import save, get_var, save_var

def main():
    if get_var('skip') is None:
        save_var('skip', 0)
    routesTable = getroutes()  # getroutes() is defined in a part of the scraper truncated from this listing
    for row in routesTable:
        if row['key'][0:2] != row['key'][2:4]:
            get_route_schedules(row['id'], row['key'])

#------------------------------------------
def get_route_schedules(routeId, route):
    #Check that it's not a route within one city
    assert route[0:2] != route[2:4]
    xml, theurl = grab(route)  # grab() is also defined in the truncated portion
    save(['routeId', 'url'], {
        "routeId": routeId,
        "url": theurl})
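# The get_var/save_var pair above is ScraperWiki's persistence helper, and the
# 'skip' variable it initialises is presumably a checkpoint counter so a long
# scrape can resume after an interrupted run. A minimal sketch of that
# pattern, with a hypothetical do_work() standing in for the real per-item job:
from scraperwiki.sqlite import get_var, save_var

def resumable_scrape(items):
    start = get_var('skip') or 0
    for i, item in enumerate(items):
        if i < start:
            continue  # already handled in an earlier run
        do_work(item)  # hypothetical per-item job
        save_var('skip', i + 1)  # checkpoint after each item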
#Load Chainsaw
from scraperwiki.utils import swimport
chainsaw = swimport('chainsaw')

#Load other stuff
from urllib2 import urlopen
from lxml.html import fromstring

xml = fromstring(urlopen('http://scraperwiki.com').read())

def example_htmltable2matrix():
    table = xml.xpath('//table')[0]
    print chainsaw.htmltable2matrix(table)

def main():
    example_htmltable2matrix()

main()
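# For reference, a minimal sketch of what an htmltable2matrix-style helper
# does: flatten an lxml <table> element into a list of rows, each row a list
# of cell texts. This is an assumed reimplementation for illustration, not
# Chainsaw's actual code, which may also handle rowspan/colspan.
def htmltable2matrix_sketch(table):
    matrix = []
    for tr in table.xpath('.//tr'):
        row = [cell.text_content().strip() for cell in tr.xpath('./td|./th')]
        matrix.append(row)
    return matrix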