def processLoadedPage(self):
    """Route a page by HTTP status; call after it is received from the network.

    For a 200 response the page is dispatched on its content type:

    * MIME type listed in ``config.pagemimetypes``: the page is saved via
      ``pageoutput`` and queued on ``pageparsequeue``.
    * URL suffix listed in ``config.documentsuffixes`` or MIME type listed
      in ``config.documentmimetypes``: the page is saved via
      ``documentoutput`` and the parent of its URL is queued on
      ``pagefetchqueue``.
    * Anything else is ignored.

    204, the listed 3xx/4xx/5xx codes and the pseudo status 0 are dropped
    without any further action.

    Returns:
        None in every non-raising case.

    Raises:
        RuntimeError: if ``self.status`` is not one of the codes handled here.
    """
    status = self.status

    if status == 200:
        # add to parse queue & save if needed
        ext = os.path.splitext(self.url)[1].lower()
        mimetype = self.header.gettype()
        if mimetype in config.pagemimetypes:
            pageoutput.save(self)
            pageparsequeue.add(self)
        elif ext in config.documentsuffixes or mimetype in config.documentmimetypes:
            # A parenthesised single-argument print produces the same output
            # as the Python 2 statement and also parses on Python 3.
            print("*** docsave %d" % len(self.body))
            documentoutput.save(self)
            # NOTE(review): the source line was flattened, so the nesting of
            # this call is an assumption - confirm it belongs to the document
            # branch rather than to every 200 response.
            pagefetchqueue.add(parenturl(self.url))
        return None

    # Status codes that are recognised but deliberately not acted upon.
    ignored_statuses = (
        204,  # No Content
        # Redirects. XXX this could be handled much smarter
        300,  # Multiple Choices
        301,
        302,  # Found - not really sure how to handle this
        # Client errors
        400,
        401,  # unauthorized
        403,  # access denied
        404,  # not found
        405,  # method not supported
        406,
        408,
        410,  # Gone
        412,
        414,  # Request URI Too Large
        423,  # Locked
        # Server errors
        500,  # internal error
        501,
        502,  # Bad Gateway
        503,
        504,  # Origin Server Timeout
        505,
        506,
        510,
        0,  # No header was sent
    )
    if status in ignored_statuses:
        return None

    # Unknown status: dump the header for debugging, log and fail loudly.
    print(vars(self.header))
    message = "unknown response code %r" % (status,)
    # No exception is being handled here, so log.exception() would attach a
    # meaningless empty traceback; error() logs at the same severity.
    log.error(message)
    raise RuntimeError(message)
links = extractlinks.parseForUrlsInHtml(page.body, page.url) except Exception, msg: log.exception(msg) return None added = 0 for link in links: if not dupelist.seen(link) and 'osdn.safaribooksonline.com' not in link: dummmy, ext = os.path.splitext(link) ext = ext.lower() if ext in config.documentsuffixes: documentfetchqueue.add(link) added += 1 else: if ext not in config.nonpagesuffixes: pagefetchqueue.add(link) added += 1 #log.debug('%d links (%d suitable for queue) in %.3fs extracted form %r' % (len(links), added, (time.time() - t), page.url)) def main(): while len(pagefetchqueue) + len(pageparsequeue) + len(documentfetchqueue) > 0: if documentfetchqueue: print range(config.paralell_downloads - status.active_downloads - 2) for i in range(config.paralell_downloads - status.active_downloads - 2): fetch_a_document() log.info('dupe: %d, fetchqueue: %d, parsequeue: %d, docqueue: %d, active: %d, ok: %d, ko: %d' %(len(dupelist), len(pagefetchqueue), len(pageparsequeue), len(documentfetchqueue), status.active_downloads, status.download_successes, status.download_failures))