def close_spider(self, spider, reason='cancelled'):
    """Close (cancel) ``spider`` and tear down everything tied to its slot.

    When the slot already reports a truthy ``closing`` value, that value is
    returned unchanged and no second shutdown chain is built.

    Otherwise the deferred returned by ``slot.close()`` is extended with the
    shutdown steps, in order: close the scraper, close the slot's scheduler,
    send ``spider_closed``, close the stats for the spider, log the closure,
    drop the slot from ``self.slots`` and finally invoke
    ``self._spider_closed_callback``. Steps are attached with ``addBoth`` so
    a failing step does not prevent the following ones from running; the
    interleaved errbacks hand any failure to ``log.err``.

    Returns the deferred that carries the whole chain.
    """
    slot = self.slots[spider]
    if slot.closing:
        return slot.closing

    log.msg("Closing spider (%s)" % reason, spider=spider)

    def close_scraper(_):
        return self.scraper.close_spider(spider)

    def close_scheduler(_):
        return slot.scheduler.close(reason)

    def fire_spider_closed(_):
        return send_catch_log_deferred(signal=signals.spider_closed,
                                       spider=spider, reason=reason)

    def close_stats(_):
        return stats.close_spider(spider, reason=reason)

    def log_closed(_):
        return log.msg("Spider closed (%s)" % reason, spider=spider)

    def drop_slot(_):
        return self.slots.pop(spider)

    def notify_closed(_):
        return self._spider_closed_callback(spider)

    dfd = slot.close()

    # Each of these steps is immediately followed by its own logging errback.
    for step in (close_scraper, close_scheduler, fire_spider_closed,
                 close_stats):
        dfd.addBoth(step)
        dfd.addErrback(log.err, spider=spider)

    # The log line and the slot removal share a single errback.
    dfd.addBoth(log_closed)
    dfd.addBoth(drop_slot)
    dfd.addErrback(log.err, spider=spider)

    # The final callback has no errback of its own.
    dfd.addBoth(notify_closed)
    return dfd
def _itemproc_finished(self, output, item, response, spider):
    """Handle the result the item processor produced for ``item``.

    The slot's ``itemproc_size`` counter is decremented first. Then, based
    on ``output``:

    * not a ``Failure``: the scraped item is logged at DEBUG level and the
      ``item_scraped`` signal is sent with ``output`` as the item;
    * a ``Failure`` wrapping ``DropItem``: the drop is logged at WARNING
      level and the ``item_dropped`` signal is sent;
    * any other ``Failure``: the error is logged and ``None`` is returned.

    In the two signalling cases the return value is whatever
    ``send_catch_log_deferred`` returns.
    """
    self.slots[spider].itemproc_size -= 1

    if not isinstance(output, Failure):
        log.msg(log.formatter.scraped(output, response, spider),
                log.DEBUG, spider=spider)
        return send_catch_log_deferred(signal=signals.item_scraped,
                                       item=output, response=response,
                                       spider=spider)

    error = output.value
    if not isinstance(error, DropItem):
        log.err(output, "Error processing %s" % item, spider=spider)
        return None

    log.msg(log.formatter.dropped(item, error, response, spider),
            level=log.WARNING, spider=spider)
    return send_catch_log_deferred(signal=signals.item_dropped,
                                   item=item, spider=spider,
                                   exception=error)
def _recv(self):
    """Poll the queue for messages until ``self.running`` becomes false.

    Each iteration calls ``API.queue_bpop`` on ``self.chnls`` with
    ``timeout=5``. A result with an empty channel or message is skipped;
    anything else is broadcast through the ``signals.RECV`` signal as a
    ``(chnl, msg)`` tuple.

    On ``redis.exceptions.ConnectionError`` the ``signals.ERROR`` signal is
    sent and ``self.stop()`` is called (presumably this clears
    ``self.running`` so the loop ends; ``stop`` is not visible here).
    """
    self.running = True
    # Wrap in a 1-tuple so a tuple-valued ``chnls`` is formatted as a whole
    # rather than being unpacked as multiple format arguments.
    log.msg('recv begin: %s' % (self.chnls,), log.DEBUG)
    while self.running:
        try:
            chnl, msg = API.queue_bpop(self.chnls, timeout=5)
        except redis.exceptions.ConnectionError:
            # The exception object itself was never used, so it is no longer
            # bound; this form is also valid on both Python 2 and Python 3.
            send_catch_log_deferred(signal=signals.ERROR, sender=self)
            self.stop()
        else:
            if not chnl or not msg:
                continue
            send_catch_log_deferred(signal=signals.RECV, sender=self,
                                    message=(chnl, msg))
def close_spider(self, spider, reason='cancelled'):
    """Close (cancel) ``spider`` and tear down everything tied to its slot.

    When the slot already reports a truthy ``closing`` value, that value is
    returned unchanged and no second shutdown chain is built.

    Otherwise the deferred returned by ``slot.close()`` is extended with the
    shutdown steps, in order: close the scraper, close the slot's scheduler,
    send ``spider_closed``, close the stats for the spider, log the closure,
    drop the slot from ``self.slots`` and finally invoke
    ``self._spider_closed_callback``. Steps are attached with ``addBoth`` so
    a failing step does not prevent the following ones from running; the
    interleaved errbacks hand any failure to ``log.err``.

    Returns the deferred that carries the whole chain.
    """
    slot = self.slots[spider]
    if slot.closing:
        return slot.closing

    log.msg("Closing spider (%s)" % reason, spider=spider)

    def close_scraper(_):
        return self.scraper.close_spider(spider)

    def close_scheduler(_):
        return slot.scheduler.close(reason)

    def fire_spider_closed(_):
        return send_catch_log_deferred(signal=signals.spider_closed,
                                       spider=spider, reason=reason)

    def close_stats(_):
        return stats.close_spider(spider, reason=reason)

    def log_closed(_):
        return log.msg("Spider closed (%s)" % reason, spider=spider)

    def drop_slot(_):
        return self.slots.pop(spider)

    def notify_closed(_):
        return self._spider_closed_callback(spider)

    dfd = slot.close()

    # Each of these steps is immediately followed by its own logging errback.
    for step in (close_scraper, close_scheduler, fire_spider_closed,
                 close_stats):
        dfd.addBoth(step)
        dfd.addErrback(log.err, spider=spider)

    # The log line and the slot removal share a single errback.
    dfd.addBoth(log_closed)
    dfd.addBoth(drop_slot)
    dfd.addErrback(log.err, spider=spider)

    # The final callback has no errback of its own.
    dfd.addBoth(notify_closed)
    return dfd
def _itemproc_finished(self, output, item, spider):
    """Handle the result the item processor produced for ``item``.

    The site's ``itemproc_size`` counter is decremented first. Then, based
    on ``output``:

    * not a ``Failure``: the passed item is logged at INFO level and the
      ``item_passed`` signal is sent with both ``item`` and ``output``;
    * a ``Failure`` wrapping ``DropItem``: the drop is logged at WARNING
      level and the ``item_dropped`` signal is sent;
    * any other ``Failure``: the error is logged and ``None`` is returned.

    In the two signalling cases the return value is whatever
    ``send_catch_log_deferred`` returns.
    """
    self.sites[spider].itemproc_size -= 1

    if not isinstance(output, Failure):
        log.msg(log.formatter.passed(output, spider), log.INFO,
                spider=spider)
        return send_catch_log_deferred(signal=signals.item_passed,
                                       item=item, spider=spider,
                                       output=output)

    error = output.value
    if not isinstance(error, DropItem):
        log.err(output, 'Error processing %s' % item, spider=spider)
        return None

    log.msg(log.formatter.dropped(item, error, spider),
            level=log.WARNING, spider=spider)
    return send_catch_log_deferred(signal=signals.item_dropped,
                                   item=item, spider=spider,
                                   exception=error)
def open_spider(self, spider):
    """Open ``spider`` on every engine component, then request its first page.

    This is a generator: it yields, in order, the results of opening the
    scheduler, opening the scraper and sending the ``spider_opened`` signal.
    NOTE(review): presumably driven by a Twisted generator decorator that
    waits on each yielded value -- the decorator is not visible here.

    The downloader and the stats collector are opened between those yields,
    and ``self.next_request(spider)`` is called last.

    Raises ``AssertionError`` when ``self.has_capacity()`` is false.
    """
    assert self.has_capacity(), (
        "No free spider slots when opening %r" % spider.name)
    log.msg("Spider opened", spider=spider)

    scheduler_opened = self.scheduler.open_spider(spider)
    yield scheduler_opened

    self.downloader.open_spider(spider)

    scraper_opened = self.scraper.open_spider(spider)
    yield scraper_opened

    stats.open_spider(spider)

    handlers_done = send_catch_log_deferred(signals.spider_opened,
                                            spider=spider)
    yield handlers_done

    self.next_request(spider)
def _itemproc_finished(self, output, item, response, spider):
    """Handle the result the item processor produced for ``item``.

    The slot's ``itemproc_size`` counter is decremented first. Then, based
    on ``output``:

    * not a ``Failure``: the scraped item is logged at DEBUG level and the
      ``item_scraped`` signal is sent with ``output`` as the item;
    * a ``Failure`` wrapping ``DropItem``: the drop is logged at WARNING
      level and the ``item_dropped`` signal is sent;
    * any other ``Failure``: the error is logged and ``None`` is returned.

    In the two signalling cases the return value is whatever
    ``send_catch_log_deferred`` returns.
    """
    self.slots[spider].itemproc_size -= 1

    if not isinstance(output, Failure):
        log.msg(log.formatter.scraped(output, response, spider),
                log.DEBUG, spider=spider)
        return send_catch_log_deferred(signal=signals.item_scraped,
                                       item=output, response=response,
                                       spider=spider)

    error = output.value
    if not isinstance(error, DropItem):
        log.err(output, 'Error processing %s' % item, spider=spider)
        return None

    log.msg(log.formatter.dropped(item, error, response, spider),
            level=log.WARNING, spider=spider)
    return send_catch_log_deferred(signal=signals.item_dropped,
                                   item=item, spider=spider,
                                   exception=error)
def open_spider(self, spider):
    """Open ``spider`` on every engine component, then request its first page.

    This is a generator: it yields, in order, the results of opening the
    scheduler, opening the scraper and sending the ``spider_opened`` signal.
    NOTE(review): presumably driven by a Twisted generator decorator that
    waits on each yielded value -- the decorator is not visible here.

    The downloader and the stats collector are opened between those yields,
    and ``self.next_request(spider)`` is called last.

    Raises ``AssertionError`` when ``self.has_capacity()`` is false.
    """
    assert self.has_capacity(), (
        "No free spider slots when opening %r" % spider.name)
    log.msg("Spider opened", spider=spider)

    scheduler_opened = self.scheduler.open_spider(spider)
    yield scheduler_opened

    self.downloader.open_spider(spider)

    scraper_opened = self.scraper.open_spider(spider)
    yield scraper_opened

    stats.open_spider(spider)

    handlers_done = send_catch_log_deferred(signals.spider_opened,
                                            spider=spider)
    yield handlers_done

    self.next_request(spider)
def _itemproc_finished(self, output, item, spider):
    """Handle the result the item processor produced for ``item``.

    The site's ``itemproc_size`` counter is decremented first. Then, based
    on ``output``:

    * not a ``Failure``: the passed item is logged at INFO level and the
      ``item_passed`` signal is sent, carrying ``output`` as both ``item``
      and ``output`` and the incoming ``item`` as ``original_item``;
    * a ``Failure`` wrapping ``DropItem``: the drop is logged at WARNING
      level and the ``item_dropped`` signal is sent;
    * any other ``Failure``: the error is logged and ``None`` is returned.

    In the two signalling cases the return value is whatever
    ``send_catch_log_deferred`` returns.
    """
    self.sites[spider].itemproc_size -= 1

    if not isinstance(output, Failure):
        log.msg(log.formatter.passed(output, spider), log.INFO,
                spider=spider)
        # TODO: remove item_passed 'output' parameter for Scrapy 0.12
        return send_catch_log_deferred(signal=signals.item_passed,
                                       item=output, spider=spider,
                                       output=output, original_item=item)

    error = output.value
    if not isinstance(error, DropItem):
        log.err(output, 'Error processing %s' % item, spider=spider)
        return None

    log.msg(log.formatter.dropped(item, error, spider),
            level=log.WARNING, spider=spider)
    return send_catch_log_deferred(signal=signals.item_dropped,
                                   item=item, spider=spider,
                                   exception=error)
def _recv(self):
    """Poll the queue for messages until ``self.running`` becomes false.

    Each iteration calls ``API.queue_bpop`` on ``self.chnls`` with
    ``timeout=5``. A result with an empty channel or message is skipped;
    anything else is broadcast through the ``signals.RECV`` signal as a
    ``(chnl, msg)`` tuple.

    On ``redis.exceptions.ConnectionError`` the ``signals.ERROR`` signal is
    sent and ``self.stop()`` is called (presumably this clears
    ``self.running`` so the loop ends; ``stop`` is not visible here).
    """
    self.running = True
    # Wrap in a 1-tuple so a tuple-valued ``chnls`` is formatted as a whole
    # rather than being unpacked as multiple format arguments.
    log.msg('recv begin: %s' % (self.chnls,), log.DEBUG)
    while self.running:
        try:
            chnl, msg = API.queue_bpop(self.chnls, timeout=5)
        except redis.exceptions.ConnectionError:
            # The exception object itself was never used, so it is no longer
            # bound; this form is also valid on both Python 2 and Python 3.
            send_catch_log_deferred(signal=signals.ERROR, sender=self)
            self.stop()
        else:
            if not chnl or not msg:
                continue
            send_catch_log_deferred(signal=signals.RECV, sender=self,
                                    message=(chnl, msg))
def open_spider(self, spider, start_requests=None, close_if_idle=True):
    """Create the slot for ``spider`` and open the components it relies on.

    A ``CallLaterOnce`` wrapping ``self._next_request`` and a scheduler built
    from ``self.settings`` are bundled into a new ``Slot``, which is stored
    in ``self.slots`` under ``spider``. An empty tuple is used when
    ``start_requests`` is falsy.

    This is a generator: it yields, in order, the results of opening the
    scheduler, opening the scraper and sending the ``spider_opened`` signal.
    NOTE(review): presumably driven by a Twisted generator decorator that
    waits on each yielded value -- the decorator is not visible here.

    The stats collector is opened before the signal is sent, and the slot's
    ``nextcall`` is scheduled last.

    Raises ``AssertionError`` when ``self.has_capacity()`` is false.
    """
    assert self.has_capacity(), (
        "No free spider slots when opening %r" % spider.name)
    log.msg("Spider opened", spider=spider)

    next_request_call = CallLaterOnce(self._next_request, spider)
    spider_scheduler = self.scheduler_cls.from_settings(self.settings)
    initial_requests = start_requests or ()
    slot = Slot(initial_requests, close_if_idle, next_request_call,
                spider_scheduler)
    self.slots[spider] = slot

    scheduler_opened = spider_scheduler.open(spider)
    yield scheduler_opened

    scraper_opened = self.scraper.open_spider(spider)
    yield scraper_opened

    stats.open_spider(spider)

    handlers_done = send_catch_log_deferred(signals.spider_opened,
                                            spider=spider)
    yield handlers_done

    slot.nextcall.schedule()
def send_catch_log_deferred(self, signal, **kwargs):
    """
    Send ``signal``, catching and logging exceptions raised by handlers.

    Works like :meth:`send_catch_log`, except that signal handlers may
    return :class:`~twisted.internet.defer.Deferred` objects. The returned
    Deferred fires once all the handlers' deferreds have fired.

    The keyword arguments are passed on to the signal handlers (those
    connected through the :meth:`connect` method). ``sender`` defaults to
    ``self.sender`` when the caller does not supply one.
    """
    default_sender = self.sender
    kwargs.setdefault("sender", default_sender)
    handlers_done = _signal.send_catch_log_deferred(signal, **kwargs)
    return handlers_done
def _finish_closing_spider(self, spider):
    """Run the final shutdown steps once ``spider`` has been closed.

    The close reason is popped from ``self.closing`` (``"finished"`` when
    none was recorded) and any still-active pending next-request call for
    the spider is cancelled.

    The ``spider_closed`` signal is then sent and its deferred is extended
    with, in order: closing the stats for the spider (failures are logged),
    logging the closure, firing the spider's entry in ``self.closing_dfds``
    and invoking ``self._spider_closed_callback``.

    Returns the deferred that carries the whole chain.
    """
    reason = self.closing.pop(spider, "finished")

    pending_call = self._next_request_calls.pop(spider, None)
    if pending_call:
        if pending_call.active():
            pending_call.cancel()

    def close_stats(_):
        return stats.close_spider(spider, reason=reason)

    def log_closed(_):
        return log.msg("Spider closed (%s)" % reason, spider=spider)

    def fire_closing_deferred(_):
        return self.closing_dfds.pop(spider).callback(spider)

    def notify_closed(_):
        return self._spider_closed_callback(spider)

    dfd = send_catch_log_deferred(signal=signals.spider_closed,
                                  spider=spider, reason=reason)
    dfd.addBoth(close_stats)
    dfd.addErrback(log.err, "Unhandled error in stats.close_spider()",
                   spider=spider)
    for step in (log_closed, fire_closing_deferred, notify_closed):
        dfd.addBoth(step)
    return dfd
def open_spider(self, spider, start_requests=None, close_if_idle=True):
    """Create the slot for ``spider`` and open the components it relies on.

    A ``CallLaterOnce`` wrapping ``self._next_request`` and a scheduler built
    from ``self.settings`` are bundled into a new ``Slot``, which is stored
    in ``self.slots`` under ``spider``. An empty tuple is used when
    ``start_requests`` is falsy.

    This is a generator: it yields, in order, the results of opening the
    scheduler, opening the scraper and sending the ``spider_opened`` signal.
    NOTE(review): presumably driven by a Twisted generator decorator that
    waits on each yielded value -- the decorator is not visible here.

    The stats collector is opened before the signal is sent, and the slot's
    ``nextcall`` is scheduled last.

    Raises ``AssertionError`` when ``self.has_capacity()`` is false.
    """
    assert self.has_capacity(), (
        "No free spider slots when opening %r" % spider.name)
    log.msg("Spider opened", spider=spider)

    next_request_call = CallLaterOnce(self._next_request, spider)
    spider_scheduler = self.scheduler_cls.from_settings(self.settings)
    initial_requests = start_requests or ()
    slot = Slot(initial_requests, close_if_idle, next_request_call,
                spider_scheduler)
    self.slots[spider] = slot

    scheduler_opened = spider_scheduler.open(spider)
    yield scheduler_opened

    scraper_opened = self.scraper.open_spider(spider)
    yield scraper_opened

    stats.open_spider(spider)

    handlers_done = send_catch_log_deferred(signals.spider_opened,
                                            spider=spider)
    yield handlers_done

    slot.nextcall.schedule()
def send_catch_log_deferred(self, signal, **kwargs):
    """
    Send ``signal``, catching and logging exceptions raised by handlers.

    Works like :meth:`send_catch_log`, except that signal handlers may
    return `deferreds`_. The returned Deferred fires once all the handlers'
    deferreds have fired.

    The keyword arguments are passed on to the signal handlers (those
    connected through the :meth:`connect` method). ``sender`` defaults to
    ``self.sender`` when the caller does not supply one.

    .. _deferreds: http://twistedmatrix.com/documents/current/core/howto/defer.html
    """
    default_sender = self.sender
    kwargs.setdefault('sender', default_sender)
    handlers_done = _signal.send_catch_log_deferred(signal, **kwargs)
    return handlers_done
def send_catch_log_deferred(self, signal, **kwargs):
    """
    Send ``signal``, catching and logging exceptions raised by handlers.

    Works like :meth:`send_catch_log`, except that signal handlers may
    return `deferreds`_. The returned Deferred fires once all the handlers'
    deferreds have fired.

    The keyword arguments are passed on to the signal handlers (those
    connected through the :meth:`connect` method). ``sender`` defaults to
    ``self.sender`` when the caller does not supply one.

    .. _deferreds: http://twistedmatrix.com/documents/current/core/howto/defer.html
    """
    default_sender = self.sender
    kwargs.setdefault('sender', default_sender)
    handlers_done = _signal.send_catch_log_deferred(signal, **kwargs)
    return handlers_done
def _finish_closing_spider(self, spider):
    """Run the final shutdown steps once ``spider`` has been closed.

    The close reason is popped from ``self.closing`` (``'finished'`` when
    none was recorded) and any still-active pending next-request call for
    the spider is cancelled.

    The ``spider_closed`` signal is then sent and its deferred is extended
    with, in order: closing the stats for the spider (failures are logged),
    logging the closure, firing the spider's entry in ``self.closing_dfds``
    and invoking ``self._spider_closed_callback``.

    Returns the deferred that carries the whole chain.
    """
    reason = self.closing.pop(spider, 'finished')

    pending_call = self._next_request_calls.pop(spider, None)
    if pending_call:
        if pending_call.active():
            pending_call.cancel()

    def close_stats(_):
        return stats.close_spider(spider, reason=reason)

    def log_closed(_):
        return log.msg("Spider closed (%s)" % reason, spider=spider)

    def fire_closing_deferred(_):
        return self.closing_dfds.pop(spider).callback(spider)

    def notify_closed(_):
        return self._spider_closed_callback(spider)

    dfd = send_catch_log_deferred(signal=signals.spider_closed,
                                  spider=spider, reason=reason)
    dfd.addBoth(close_stats)
    dfd.addErrback(log.err, "Unhandled error in stats.close_spider()",
                   spider=spider)
    for step in (log_closed, fire_closing_deferred, notify_closed):
        dfd.addBoth(step)
    return dfd
def _process_spidermw_output(self, output, request, response, spider):
    """Dispatch one value produced by the spider (via ``output``).

    * ``Request``: the ``request_received`` signal is sent and the request
      is handed to ``self.engine.crawl``; returns ``None``.
    * ``BaseItem``: the item is logged at DEBUG level, the site's
      ``itemproc_size`` counter is incremented, and ``item_scraped`` is
      sent. The item processor and then ``self._itemproc_finished`` are
      chained onto the signal's deferred, which is returned.
    * ``None``: ignored.
    * anything else: an ERROR-level message naming the offending type is
      logged; returns ``None``.
    """
    if isinstance(output, Request):
        send_catch_log(signal=signals.request_received, request=output,
                       spider=spider)
        self.engine.crawl(request=output, spider=spider)
        return None

    if isinstance(output, BaseItem):
        log.msg(log.formatter.scraped(output, request, response, spider),
                level=log.DEBUG, spider=spider)
        self.sites[spider].itemproc_size += 1

        def run_item_processor(_):
            return self.itemproc.process_item(output, spider)

        dfd = send_catch_log_deferred(signal=signals.item_scraped,
                                      item=output, spider=spider,
                                      response=response)
        dfd.addBoth(run_item_processor)
        dfd.addBoth(self._itemproc_finished, output, spider)
        return dfd

    if output is None:
        return None

    type_name = type(output).__name__
    log.msg("Spider must return Request, BaseItem or None, got %r in %s" %
            (type_name, request), log.ERROR, spider=spider)
def _process_spidermw_output(self, output, request, response, spider):
    """Dispatch one value produced by the spider (via ``output``).

    * ``Request``: the ``request_received`` signal is sent and the request
      is handed to ``self.engine.crawl``; returns ``None``.
    * ``BaseItem``: the item is logged at DEBUG level, the site's
      ``itemproc_size`` counter is incremented, and ``item_scraped`` is
      sent. The item processor and then ``self._itemproc_finished`` are
      chained onto the signal's deferred, which is returned.
    * ``None``: ignored.
    * anything else: an ERROR-level message naming the offending type is
      logged; returns ``None``.
    """
    if isinstance(output, Request):
        send_catch_log(signal=signals.request_received, request=output,
                       spider=spider)
        self.engine.crawl(request=output, spider=spider)
        return None

    if isinstance(output, BaseItem):
        log.msg(log.formatter.scraped(output, request, response, spider),
                level=log.DEBUG, spider=spider)
        self.sites[spider].itemproc_size += 1

        def run_item_processor(_):
            return self.itemproc.process_item(output, spider)

        dfd = send_catch_log_deferred(signal=signals.item_scraped,
                                      item=output, spider=spider,
                                      response=response)
        dfd.addBoth(run_item_processor)
        dfd.addBoth(self._itemproc_finished, output, spider)
        return dfd

    if output is None:
        return None

    type_name = type(output).__name__
    log.msg("Spider must return Request, BaseItem or None, got %r in %s" %
            (type_name, request), log.ERROR, spider=spider)
def _finish_stopping_engine(self):
    """Send ``engine_stopped`` and then notify the stats collector.

    This is a generator yielding two values in order: the result of sending
    the ``engine_stopped`` signal, then the result of
    ``stats.engine_stopped()``.
    NOTE(review): presumably driven by a Twisted generator decorator that
    waits on each yielded value -- the decorator is not visible here.
    """
    handlers_done = send_catch_log_deferred(signal=signals.engine_stopped)
    yield handlers_done

    stats_done = stats.engine_stopped()
    yield stats_done
def _get_result(self, signal, *a, **kw):
    """Send ``signal`` through ``send_catch_log_deferred`` and return its result.

    Extra positional and keyword arguments are forwarded unchanged.
    """
    result = send_catch_log_deferred(signal, *a, **kw)
    return result
def start(self):
    """Start the execution engine.

    Records the start time, sends the ``engine_started`` signal and, once
    the generator is resumed after that yield, marks the engine as running.
    NOTE(review): presumably driven by a Twisted generator decorator that
    waits on the yielded value -- the decorator is not visible here.

    Raises ``AssertionError`` when the engine is already running.
    """
    assert not self.running, "Engine already running"
    self.start_time = time()

    handlers_done = send_catch_log_deferred(signal=signals.engine_started)
    yield handlers_done

    self.running = True
def _finish_stopping_engine(self):
    """Send ``engine_stopped`` and then notify the stats collector.

    This is a generator yielding two values in order: the result of sending
    the ``engine_stopped`` signal, then the result of
    ``stats.engine_stopped()``.
    NOTE(review): presumably driven by a Twisted generator decorator that
    waits on each yielded value -- the decorator is not visible here.
    """
    handlers_done = send_catch_log_deferred(signal=signals.engine_stopped)
    yield handlers_done

    stats_done = stats.engine_stopped()
    yield stats_done
def send_catch_log_deferred(self, *a, **kw):
    """Forward to ``signal.send_catch_log_deferred`` with a default sender.

    ``sender`` is set to ``self.sender`` unless the caller already passed
    one; all other positional and keyword arguments are forwarded unchanged.
    Returns whatever the underlying function returns.
    """
    default_sender = self.sender
    kw.setdefault('sender', default_sender)
    result = signal.send_catch_log_deferred(*a, **kw)
    return result
def _get_result(self, signal, *a, **kw):
    """Send ``signal`` through ``send_catch_log_deferred`` and return its result.

    Extra positional and keyword arguments are forwarded unchanged.
    """
    result = send_catch_log_deferred(signal, *a, **kw)
    return result
def start(self):
    """Start the execution engine.

    Records the start time, sends the ``engine_started`` signal and, once
    the generator is resumed after that yield, marks the engine as running.
    NOTE(review): presumably driven by a Twisted generator decorator that
    waits on the yielded value -- the decorator is not visible here.

    Raises ``AssertionError`` when the engine is already running.
    """
    assert not self.running, "Engine already running"
    self.start_time = time()

    handlers_done = send_catch_log_deferred(signal=signals.engine_started)
    yield handlers_done

    self.running = True