def site_spider(self, root_url):
    """ spider every page of the site we can find,
    report back with links found and their details """
    response = o.SpiderResponse(url=root_url)
    response.pages = []

    # starting @ the root spider all the sites we can find w/in
    # the domain
    links = self.link_spider(root_url, 1000, True)

    # all that data is nice and cached so we can reprocess it
    for link in links + [root_url]:
        page = o.Page(url=link)
        page.links = self.get_links(link)
        page.images = self.get_images(link)
        try:
            with srvs_connect(Requester) as c:
                r = c.urlopen(ro.Request(link))
                page.response = r
        except o.Exception, ex:
            # problem w/ response = no response
            print "o.request exception: %s %s" % (link, ex.msg)
        except Exception, ex:
            print "request exception: %s %s" % (link, ex)

        # the snippet truncates here; collecting each page and returning
        # the response is an assumed completion
        response.pages.append(page)

    return response
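# A hedged usage sketch of the spider above. The class name SiteSpider is
# an assumption (the snippet only shows methods); response.pages, page.url,
# page.links, and page.images come from the code itself:
#
#     spider = SiteSpider()
#     result = spider.site_spider("http://example.com/")
#     for page in result.pages:
#         print "%s: %d links, %d images" % (
#             page.url, len(page.links), len(page.images))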
def get_links(self, url):
    """ returns back the href for all links on page """
    url = url.strip()
    print "get_links: %s" % url

    # if it's an image forget it
    if url.lower().endswith(self.not_html_ext):
        return []

    # request the url
    try:
        with srvs_connect(Requester) as c:
            r = c.urlopen(ro.Request(url))
            if not r:
                return []
    except o.Exception, ex:
        raise o.Exception("o.Could not make request: %s %s" % (url, ex))
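# get_links truncates after the request; the href extraction that presumably
# follows is missing from the snippet. Below is a minimal, hedged sketch of
# that step using the Python 2 stdlib. It assumes the response exposes its
# raw HTML as r.body (the attribute name is an assumption), so the end of
# get_links would be something like:
#
#     return _extract_attrs(r.body, "a", "href")

from HTMLParser import HTMLParser

class _AttrParser(HTMLParser):
    """ collects the values of one attribute on one tag, e.g. a/href """
    def __init__(self, tag, attr):
        HTMLParser.__init__(self)
        self.tag, self.attr, self.values = tag, attr, []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the opening tag
        if tag == self.tag:
            for name, value in attrs:
                if name == self.attr and value:
                    self.values.append(value)

def _extract_attrs(html, tag, attr):
    """ hypothetical helper: pull every a/href or img/src out of raw html """
    parser = _AttrParser(tag, attr)
    parser.feed(html)
    return parser.values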
def get_images(self, url):
    """ returns back the src for all images on page """
    url = url.strip()
    print "get_images: %s" % url

    # only care to parse html pages
    if url.lower().endswith(self.not_html_ext):
        return []

    # request the url
    try:
        print "get image making request: %s" % url
        with srvs_connect(Requester) as c:
            r = c.urlopen(ro.Request(url))
            if not r:
                print "get image no response: %s" % url
                return []
    except o.Exception, ex:
        raise o.Exception("o.Could not make request: %s %s" % (url, ex))
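# get_images is truncated the same way; with the hypothetical helper above,
# the missing step would presumably be:
#
#     return _extract_attrs(r.body, "img", "src")
#
# Also worth noting: str.endswith accepts a tuple, so self.not_html_ext is
# presumably a tuple of non-HTML extensions along these (assumed) lines:
#
#     self.not_html_ext = (".jpg", ".jpeg", ".png", ".gif",
#                          ".css", ".js", ".pdf", ".zip")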