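# NOTE: reconstructed import block. This class depends on names imported at
# the top of the original module. The twisted import is standard; the scrapy
# paths below are assumptions based on the 0.x-era layout this file appears
# to come from, so adjust them to match your tree. `SpiderInfo` is expected
# to be defined earlier in this same module.
from twisted.python.failure import Failure

from scrapy.utils.defer import defer_result, defer_succeed, parallel
from scrapy.utils.spider import iterate_spider_output
from scrapy.utils.misc import load_object
from scrapy.utils.signal import send_catch_log
from scrapy.core import signals
from scrapy.core.exceptions import IgnoreRequest, DropItem
from scrapy.core.spidermw import SpiderMiddlewareManager
from scrapy.http import Request, Response
from scrapy.item import BaseItem
from scrapy import log
from scrapy.stats import stats
from scrapy.conf import settings
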
class Scraper(object):

    def __init__(self, engine):
        self.sites = {}
        self.spidermw = SpiderMiddlewareManager()
        self.itemproc = load_object(settings['ITEM_PROCESSOR'])()
        self.concurrent_items = settings.getint('CONCURRENT_ITEMS')
        self.engine = engine

    def open_spider(self, spider):
        """Open the given spider for scraping and allocate resources for it"""
        if spider in self.sites:
            raise RuntimeError('Scraper spider already opened: %s' % spider)
        self.sites[spider] = SpiderInfo()
        self.itemproc.open_spider(spider)

    def close_spider(self, spider):
        """Close a spider being scraped and release its resources"""
        if spider not in self.sites:
            raise RuntimeError('Scraper spider already closed: %s' % spider)
        self.sites.pop(spider)
        self.itemproc.close_spider(spider)

    def is_idle(self):
        """Return True if there aren't any more spiders to process"""
        return not self.sites

    def enqueue_scrape(self, response, request, spider):
        site = self.sites[spider]
        dfd = site.add_response_request(response, request)
        # FIXME: this can't be called here because the stats spider may be
        # already closed
        #stats.max_value('scraper/max_active_size', site.active_size, \
        #    spider=spider)
        def finish_scraping(_):
            site.finish_response(response)
            self._scrape_next(spider, site)
            return _
        dfd.addBoth(finish_scraping)
        dfd.addErrback(log.err, 'Scraper bug processing %s' % request, \
            spider=spider)
        self._scrape_next(spider, site)
        return dfd

    def _scrape_next(self, spider, site):
        while site.queue:
            response, request, deferred = site.next_response_request_deferred()
            self._scrape(response, request, spider).chainDeferred(deferred)

    def _scrape(self, response, request, spider):
        """Handle the downloaded response or failure through the spider
        callback/errback"""
        assert isinstance(response, (Response, Failure))
        dfd = self._scrape2(response, request, spider) # returns spider's processed output
        dfd.addErrback(self.handle_spider_error, request, spider)
        dfd.addCallback(self.handle_spider_output, request, response, spider)
        return dfd

    def _scrape2(self, request_result, request, spider):
        """Handle the different cases of a request's result being a Response
        or a Failure"""
        if not isinstance(request_result, Failure):
            return self.spidermw.scrape_response(self.call_spider, \
                request_result, request, spider)
        else:
            # FIXME: don't ignore errors in spider middleware
            dfd = self.call_spider(request_result, request, spider)
            return dfd.addErrback(self._check_propagated_failure, \
                request_result, request, spider)

    def call_spider(self, result, request, spider):
        # fire the request's deferred (which carries the spider
        # callback/errback) with the result, then flatten whatever the
        # spider returned into an iterable
        defer_result(result).chainDeferred(request.deferred)
        return request.deferred.addCallback(iterate_spider_output)

    def handle_spider_error(self, _failure, request, spider, propagated_failure=None):
        referer = request.headers.get('Referer', None)
        msg = "Spider exception caught while processing <%s> (referer: <%s>): %s" % \
            (request.url, referer, _failure)
        log.msg(msg, log.ERROR, spider=spider)
        stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, \
            spider=spider)

    def handle_spider_output(self, result, request, response, spider):
        if not result:
            return defer_succeed(None)
        dfd = parallel(iter(result), self.concurrent_items,
            self._process_spidermw_output, request, response, spider)
        return dfd

    def _process_spidermw_output(self, output, request, response, spider):
        """Process each Request/Item (given in the output parameter) returned
        from the given spider
        """
        # TODO: keep closing state internally instead of checking engine
        if spider in self.engine.closing:
            return
        elif isinstance(output, Request):
            send_catch_log(signal=signals.request_received, request=output, \
                spider=spider)
            self.engine.crawl(request=output, spider=spider)
        elif isinstance(output, BaseItem):
            log.msg("Scraped %s in <%s>" % (output, request.url), level=log.DEBUG, \
                spider=spider)
            send_catch_log(signal=signals.item_scraped, sender=self.__class__, \
                item=output, spider=spider, response=response)
            self.sites[spider].itemproc_size += 1
            # FIXME: this can't be called here because the stats spider may be
            # already closed
            #stats.max_value('scraper/max_itemproc_size', \
            #    self.sites[spider].itemproc_size, spider=spider)
            dfd = self.itemproc.process_item(output, spider)
            dfd.addBoth(self._itemproc_finished, output, spider)
            return dfd
        elif output is None:
            pass
        else:
            log.msg("Spider must return Request, BaseItem or None, got %r in %s" % \
                (type(output).__name__, request), log.ERROR, spider=spider)

    def _check_propagated_failure(self, spider_failure, propagated_failure, request, spider):
        """Log and silence the bugs raised outside of spiders, but still allow
        spiders to be notified about general failures while downloading
        spider-generated requests
        """
        # ignored requests are commonly propagated exceptions that are safe to
        # be silenced
        if isinstance(spider_failure.value, IgnoreRequest):
            return
        elif spider_failure is propagated_failure:
            log.err(spider_failure, 'Unhandled error propagated to spider', \
                spider=spider)
            return # stop propagating this error
        else:
            return spider_failure # exceptions raised in the spider code

    def _itemproc_finished(self, output, item, spider):
        """ItemProcessor finished for the given ``item`` and returned ``output``
        """
        self.sites[spider].itemproc_size -= 1
        if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
                log.msg("Dropped %s - %s" % (item, str(ex)), level=log.WARNING, \
                    spider=spider)
                send_catch_log(signal=signals.item_dropped, sender=self.__class__, \
                    item=item, spider=spider, exception=output.value)
            else:
                log.msg('Error processing %s - %s' % (item, output), \
                    log.ERROR, spider=spider)
        else:
            log.msg("Passed %s" % item, log.INFO, spider=spider)
            send_catch_log(signal=signals.item_passed, sender=self.__class__, \
                item=item, spider=spider, output=output)
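

# Usage sketch (hypothetical driver code, not part of this module): the
# execution engine owns the Scraper and feeds it downloaded responses. The
# `engine`, `spider`, `request` and `response` names below are assumed to
# already exist in the caller; only Scraper methods defined above are used.
#
#   scraper = Scraper(engine)
#   scraper.open_spider(spider)     # allocate a SpiderInfo slot for the spider
#   # for each completed download (a Response) or download error (a Failure):
#   dfd = scraper.enqueue_scrape(response, request, spider)
#   # ...once the spider has no more pending responses:
#   scraper.close_spider(spider)
#   assert scraper.is_idle()        # True when no spiders remain open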