Example #1
0
    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) ``spider`` and release everything held for it.

        If the spider's slot already has a truthy ``closing`` value, that
        value is returned unchanged. Otherwise the deferred returned by
        ``slot.close()`` is returned, with the shutdown steps chained on it.
        """
        slot = self.slots[spider]
        if slot.closing:
            return slot.closing
        log.msg("Closing spider (%s)" % reason, spider=spider)

        def close_scraper(_):
            return self.scraper.close_spider(spider)

        def close_scheduler(_):
            return slot.scheduler.close(reason)

        def fire_closed_signal(_):
            return send_catch_log_deferred(
                signal=signals.spider_closed, spider=spider, reason=reason)

        def close_stats(_):
            return stats.close_spider(spider, reason=reason)

        def log_closed(_):
            return log.msg("Spider closed (%s)" % reason, spider=spider)

        def drop_slot(_):
            return self.slots.pop(spider)

        def notify_closed(_):
            return self._spider_closed_callback(spider)

        # (step, log_failure) pairs, in execution order. Every step is added
        # with addBoth; flagged steps are followed by an errback that hands
        # the failure to log.err.
        steps = (
            (close_scraper, True),
            (close_scheduler, True),
            (fire_closed_signal, True),
            (close_stats, True),
            (log_closed, False),
            (drop_slot, True),
            (notify_closed, False),
        )

        dfd = slot.close()
        for step, log_failure in steps:
            dfd.addBoth(step)
            if log_failure:
                dfd.addErrback(log.err, spider=spider)
        return dfd
Example #2
0
 def _itemproc_finished(self, output, item, response, spider):
     """Handle the ``output`` the ItemProcessor produced for ``item``.

     Decrements the spider slot's ``itemproc_size`` and then:

     * non-Failure ``output``: logs it as scraped and returns the result of
       sending ``signals.item_scraped``;
     * Failure wrapping ``DropItem``: logs a warning and returns the result
       of sending ``signals.item_dropped``;
     * any other Failure: logs the error and returns None.
     """
     self.slots[spider].itemproc_size -= 1

     if not isinstance(output, Failure):
         scraped_msg = log.formatter.scraped(output, response, spider)
         log.msg(scraped_msg, log.DEBUG, spider=spider)
         return send_catch_log_deferred(
             signal=signals.item_scraped, item=output, response=response,
             spider=spider)

     error = output.value
     if not isinstance(error, DropItem):
         log.err(output, "Error processing %s" % item, spider=spider)
         return None

     dropped_msg = log.formatter.dropped(item, error, response, spider)
     log.msg(dropped_msg, level=log.WARNING, spider=spider)
     return send_catch_log_deferred(
         signal=signals.item_dropped, item=item, spider=spider,
         exception=error)
Example #3
0
    def _recv(self):
        """Poll the channels in ``self.chnls`` and emit a signal per message.

        Sets ``self.running`` to True and loops for as long as it stays
        truthy. Each iteration calls ``API.queue_bpop`` with a 5 second
        timeout and then:

        * on ``redis.exceptions.ConnectionError``: sends ``signals.ERROR``
          and calls ``self.stop()`` (presumably that clears ``running`` --
          ``stop`` is not visible here, confirm);
        * if the channel or the message is falsy: skips the iteration;
        * otherwise: sends ``signals.RECV`` with ``message=(chnl, msg)``.
        """
        self.running = True
        # Wrap in a 1-tuple so a tuple-valued ``chnls`` is formatted as one
        # value instead of being unpacked as the format arguments.
        log.msg('recv begin: %s' % (self.chnls,), log.DEBUG)

        while self.running:
            try:
                chnl, msg = API.queue_bpop(self.chnls, timeout=5)
            except redis.exceptions.ConnectionError:
                # The exception object was never used, so it is not bound;
                # this form parses on both Python 2 and Python 3.
                send_catch_log_deferred(signal=signals.ERROR, sender=self)
                self.stop()
            else:
                if not chnl or not msg:
                    continue

                send_catch_log_deferred(signal=signals.RECV,
                                        sender=self,
                                        message=(chnl, msg))
Example #4
0
File: engine.py Project: tml/scrapy
    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) ``spider`` and clear all its outstanding requests.

        Returns the slot's existing ``closing`` value when it is truthy;
        otherwise returns the deferred from ``slot.close()`` after chaining
        the scraper, scheduler, signal, stats, logging, slot-removal and
        closed-callback steps onto it, in that order.
        """
        slot = self.slots[spider]
        if slot.closing:
            return slot.closing
        log.msg("Closing spider (%s)" % reason, spider=spider)

        dfd = slot.close()

        def then(step, log_failure=True):
            # Chain ``step`` with addBoth and, unless told otherwise, follow
            # it with an errback that passes the failure to log.err.
            dfd.addBoth(lambda _: step())
            if log_failure:
                dfd.addErrback(log.err, spider=spider)

        then(lambda: self.scraper.close_spider(spider))
        then(lambda: slot.scheduler.close(reason))
        then(lambda: send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        then(lambda: stats.close_spider(spider, reason=reason))
        then(lambda: log.msg("Spider closed (%s)" % reason, spider=spider),
             log_failure=False)
        then(lambda: self.slots.pop(spider))
        then(lambda: self._spider_closed_callback(spider), log_failure=False)

        return dfd
Example #5
0
 def _itemproc_finished(self, output, item, spider):
     """Handle the ``output`` the ItemProcessor produced for ``item``.

     Decrements ``itemproc_size`` on the spider's site and then:

     * non-Failure ``output``: logs it as passed and returns the result of
       sending ``signals.item_passed``;
     * Failure wrapping ``DropItem``: logs a warning and returns the result
       of sending ``signals.item_dropped``;
     * any other Failure: logs the error and returns None.
     """
     self.sites[spider].itemproc_size -= 1

     if not isinstance(output, Failure):
         passed_msg = log.formatter.passed(output, spider)
         log.msg(passed_msg, log.INFO, spider=spider)
         return send_catch_log_deferred(
             signal=signals.item_passed, item=item, spider=spider,
             output=output)

     error = output.value
     if not isinstance(error, DropItem):
         log.err(output, 'Error processing %s' % item, spider=spider)
         return None

     dropped_msg = log.formatter.dropped(item, error, spider)
     log.msg(dropped_msg, level=log.WARNING, spider=spider)
     return send_catch_log_deferred(
         signal=signals.item_dropped, item=item, spider=spider,
         exception=error)
Example #6
0
 def open_spider(self, spider):
     """Open ``spider`` on the scheduler, downloader, scraper and stats.

     Generator: yields, in order, the values returned by the scheduler's
     and the scraper's ``open_spider`` and by sending
     ``signals.spider_opened`` (presumably Deferreds consumed by an
     ``inlineCallbacks``-style decorator that is not visible here), then
     calls ``next_request`` for the spider.

     Raises AssertionError when ``has_capacity()`` is false.
     """
     assert self.has_capacity(), \
         "No free spider slots when opening %r" % spider.name
     log.msg("Spider opened", spider=spider)

     scheduler_opened = self.scheduler.open_spider(spider)
     yield scheduler_opened

     self.downloader.open_spider(spider)
     scraper_opened = self.scraper.open_spider(spider)
     yield scraper_opened

     stats.open_spider(spider)
     signal_sent = send_catch_log_deferred(signals.spider_opened,
                                           spider=spider)
     yield signal_sent

     self.next_request(spider)
Example #7
0
 def _itemproc_finished(self, output, item, response, spider):
     """Finish item processing for ``item`` given the processor ``output``.

     The spider slot's ``itemproc_size`` is decremented first. A Failure
     wrapping ``DropItem`` is logged as a warning and reported through
     ``signals.item_dropped``; any other Failure is logged as an error
     (None is returned); anything else is logged at DEBUG level and
     reported through ``signals.item_scraped``.
     """
     self.slots[spider].itemproc_size -= 1

     failed = isinstance(output, Failure)
     if failed and isinstance(output.value, DropItem):
         drop_reason = output.value
         log.msg(
             log.formatter.dropped(item, drop_reason, response, spider),
             level=log.WARNING,
             spider=spider,
         )
         return send_catch_log_deferred(
             signal=signals.item_dropped,
             item=item,
             spider=spider,
             exception=drop_reason,
         )

     if failed:
         log.err(output, 'Error processing %s' % item, spider=spider)
         return None

     log.msg(
         log.formatter.scraped(output, response, spider),
         log.DEBUG,
         spider=spider,
     )
     return send_catch_log_deferred(
         signal=signals.item_scraped,
         item=output,
         response=response,
         spider=spider,
     )
Example #8
0
 def open_spider(self, spider):
     """Bring ``spider`` up on every engine component, then start crawling.

     Generator that yields the results of opening the scheduler, opening
     the scraper and sending ``signals.spider_opened``, in that order
     (presumably driven by an ``inlineCallbacks``-style decorator that is
     not visible here). The downloader and stats are opened in between
     without yielding, and ``next_request`` is called last.

     Raises AssertionError when ``has_capacity()`` is false.
     """
     has_room = self.has_capacity()
     assert has_room, "No free spider slots when opening %r" % spider.name
     log.msg("Spider opened", spider=spider)

     # Scheduler first, then the downloader and scraper.
     yield self.scheduler.open_spider(spider)
     self.downloader.open_spider(spider)
     yield self.scraper.open_spider(spider)

     # Stats are opened before the spider_opened signal is sent.
     stats.open_spider(spider)
     yield send_catch_log_deferred(signals.spider_opened, spider=spider)

     self.next_request(spider)
Example #9
0
 def _itemproc_finished(self, output, item, spider):
     """Finish item processing for ``item`` given the processor ``output``.

     The site's ``itemproc_size`` is decremented first. A Failure wrapping
     ``DropItem`` is logged as a warning and reported through
     ``signals.item_dropped``; any other Failure is logged as an error
     (None is returned); anything else is logged at INFO level and
     reported through ``signals.item_passed``.
     """
     self.sites[spider].itemproc_size -= 1

     failed = isinstance(output, Failure)
     if failed and isinstance(output.value, DropItem):
         drop_reason = output.value
         log.msg(
             log.formatter.dropped(item, drop_reason, spider),
             level=log.WARNING,
             spider=spider,
         )
         return send_catch_log_deferred(
             signal=signals.item_dropped,
             item=item,
             spider=spider,
             exception=drop_reason,
         )

     if failed:
         log.err(output, 'Error processing %s' % item, spider=spider)
         return None

     log.msg(log.formatter.passed(output, spider), log.INFO, spider=spider)
     # TODO: remove item_passed 'output' parameter for Scrapy 0.12
     return send_catch_log_deferred(
         signal=signals.item_passed,
         item=output,
         spider=spider,
         output=output,
         original_item=item,
     )
Example #10
0
    def _recv(self):
        """Receive loop: pop messages from ``self.chnls`` and signal them.

        Marks the receiver as running, then keeps calling
        ``API.queue_bpop`` (5 second timeout) while ``self.running`` is
        truthy:

        * ``redis.exceptions.ConnectionError`` -> ``signals.ERROR`` is sent
          and ``self.stop()`` is called (presumably ending the loop --
          ``stop`` is not visible here, confirm);
        * a falsy channel or message -> the iteration is skipped;
        * otherwise -> ``signals.RECV`` is sent with
          ``message=(chnl, msg)``.
        """
        self.running = True
        # The 1-tuple keeps a tuple-valued ``chnls`` from being unpacked as
        # the format arguments (TypeError for two or more channels).
        log.msg('recv begin: %s' % (self.chnls,), log.DEBUG)

        while self.running:
            try:
                chnl, msg = API.queue_bpop(self.chnls, timeout=5)
            except redis.exceptions.ConnectionError:
                # The exception instance was unused; leaving it unbound
                # keeps this valid on both Python 2 and Python 3.
                send_catch_log_deferred(
                    signal=signals.ERROR,
                    sender=self
                )
                self.stop()
            else:
                if not chnl or not msg:
                    continue

                send_catch_log_deferred(
                    signal=signals.RECV,
                    sender=self,
                    message=(chnl, msg)
                )
Example #11
0
 def open_spider(self, spider, start_requests=None, close_if_idle=True):
     """Create a slot for ``spider``, open its components and schedule it.

     Builds a ``CallLaterOnce`` around ``_next_request``, a scheduler from
     ``self.settings`` and a ``Slot`` holding both, and stores the slot
     under ``self.slots[spider]``. As a generator it then yields the
     results of opening the scheduler, opening the scraper and sending
     ``signals.spider_opened`` (presumably consumed by an
     ``inlineCallbacks``-style decorator that is not visible here), and
     finally schedules the slot's next call.

     ``start_requests`` falls back to an empty tuple when falsy.
     Raises AssertionError when ``has_capacity()`` is false.
     """
     assert self.has_capacity(), \
         "No free spider slots when opening %r" % spider.name
     log.msg("Spider opened", spider=spider)

     next_call = CallLaterOnce(self._next_request, spider)
     spider_scheduler = self.scheduler_cls.from_settings(self.settings)
     initial_requests = start_requests or ()
     slot = Slot(initial_requests, close_if_idle, next_call,
                 spider_scheduler)
     self.slots[spider] = slot

     yield spider_scheduler.open(spider)
     yield self.scraper.open_spider(spider)
     stats.open_spider(spider)
     yield send_catch_log_deferred(signals.spider_opened, spider=spider)

     slot.nextcall.schedule()
Example #12
0
    def send_catch_log_deferred(self, signal, **kwargs):
        """
        Deferred-aware counterpart of :meth:`send_catch_log`: signal
        handlers may return :class:`~twisted.internet.defer.Deferred`
        objects.

        Sends the signal, catching and logging exceptions, and returns a
        Deferred that fires once all the handlers' deferreds have fired.

        The keyword arguments are forwarded to the signal handlers
        (connected through the :meth:`connect` method). ``sender`` is
        filled in from ``self.sender`` when the caller does not pass it.
        """
        default_sender = self.sender
        if "sender" not in kwargs:
            kwargs["sender"] = default_sender
        return _signal.send_catch_log_deferred(signal, **kwargs)
Example #13
0
 def _finish_closing_spider(self, spider):
     """Run the final close steps once ``spider`` has been closed.

     Pops the close reason (default ``"finished"``) and any pending
     next-request call, cancelling the call if it is still active. Then
     sends ``signals.spider_closed`` and chains onto the returned deferred:
     closing stats (failures are logged), logging the closure, firing the
     spider's entry in ``closing_dfds`` and invoking
     ``_spider_closed_callback``. Returns that deferred.
     """
     reason = self.closing.pop(spider, "finished")
     pending_call = self._next_request_calls.pop(spider, None)
     if pending_call and pending_call.active():
         pending_call.cancel()

     def close_stats(_):
         return stats.close_spider(spider, reason=reason)

     def log_closed(_):
         return log.msg("Spider closed (%s)" % reason, spider=spider)

     def fire_closing_dfd(_):
         return self.closing_dfds.pop(spider).callback(spider)

     def notify_closed(_):
         return self._spider_closed_callback(spider)

     dfd = send_catch_log_deferred(
         signal=signals.spider_closed, spider=spider, reason=reason)
     dfd.addBoth(close_stats)
     dfd.addErrback(
         log.err, "Unhandled error in stats.close_spider()", spider=spider)
     for step in (log_closed, fire_closing_dfd, notify_closed):
         dfd.addBoth(step)
     return dfd
Example #14
0
 def open_spider(self, spider, start_requests=None, close_if_idle=True):
     """Register a new slot for ``spider`` and open it for crawling.

     A ``Slot`` is built from the start requests (an empty tuple when
     ``start_requests`` is falsy), ``close_if_idle``, a ``CallLaterOnce``
     wrapping ``_next_request`` and a scheduler created from
     ``self.settings``; it is stored in ``self.slots``. The generator then
     yields the results of opening the scheduler, opening the scraper and
     sending ``signals.spider_opened`` (presumably driven by an
     ``inlineCallbacks``-style decorator that is not visible here) before
     scheduling the slot's next call.

     Raises AssertionError when ``has_capacity()`` is false.
     """
     has_room = self.has_capacity()
     assert has_room, "No free spider slots when opening %r" % spider.name
     log.msg("Spider opened", spider=spider)

     nextcall = CallLaterOnce(self._next_request, spider)
     scheduler = self.scheduler_cls.from_settings(self.settings)
     self.slots[spider] = slot = Slot(
         start_requests or (), close_if_idle, nextcall, scheduler)

     opened = scheduler.open(spider)
     yield opened
     opened = self.scraper.open_spider(spider)
     yield opened
     stats.open_spider(spider)
     opened = send_catch_log_deferred(signals.spider_opened, spider=spider)
     yield opened

     slot.nextcall.schedule()
Example #15
0
    def send_catch_log_deferred(self, signal, **kwargs):
        """
        Variant of :meth:`send_catch_log` whose signal handlers may return
        `deferreds`_.

        The signal is sent, exceptions are caught and logged, and the
        returned Deferred fires once all the handlers' deferreds have
        fired.

        Keyword arguments are passed through to the signal handlers
        (connected through the :meth:`connect` method); ``sender``
        defaults to ``self.sender`` when not supplied.

        .. _deferreds: http://twistedmatrix.com/documents/current/core/howto/defer.html
        """
        default_sender = self.sender
        if 'sender' not in kwargs:
            kwargs['sender'] = default_sender
        return _signal.send_catch_log_deferred(signal, **kwargs)
Example #16
0
    def send_catch_log_deferred(self, signal, **kwargs):
        """
        Send ``signal`` like :meth:`send_catch_log`, but allow handlers to
        return `deferreds`_.

        Exceptions are caught and logged. The result is a Deferred that
        fires once all the handlers' deferreds have fired.

        All keyword arguments reach the signal handlers (connected through
        the :meth:`connect` method). A missing ``sender`` is taken from
        ``self.sender``.

        .. _deferreds: http://twistedmatrix.com/documents/current/core/howto/defer.html
        """
        handler_kwargs = dict(kwargs)
        handler_kwargs.setdefault('sender', self.sender)
        result = _signal.send_catch_log_deferred(signal, **handler_kwargs)
        return result
Example #17
0
 def _finish_closing_spider(self, spider):
     """Complete the shutdown of ``spider`` after it has been closed.

     The close reason is popped from ``self.closing`` (``'finished'`` when
     absent) and a still-active pending next-request call is cancelled.
     ``signals.spider_closed`` is then sent and the returned deferred gets
     these steps chained in order: close stats (errors logged), log the
     closure, fire the deferred stored in ``closing_dfds`` and call
     ``_spider_closed_callback``. The deferred is returned.
     """
     reason = self.closing.pop(spider, 'finished')
     scheduled = self._next_request_calls.pop(spider, None)
     if scheduled and scheduled.active():
         scheduled.cancel()

     def close_stats(_):
         return stats.close_spider(spider, reason=reason)

     def log_closed(_):
         return log.msg("Spider closed (%s)" % reason, spider=spider)

     def fire_closing_dfd(_):
         return self.closing_dfds.pop(spider).callback(spider)

     def run_closed_callback(_):
         return self._spider_closed_callback(spider)

     dfd = send_catch_log_deferred(
         signal=signals.spider_closed, spider=spider, reason=reason)
     dfd.addBoth(close_stats)
     dfd.addErrback(
         log.err, "Unhandled error in stats.close_spider()", spider=spider)
     dfd.addBoth(log_closed)
     dfd.addBoth(fire_closing_dfd)
     dfd.addBoth(run_closed_callback)
     return dfd
Example #18
0
 def _process_spidermw_output(self, output, request, response, spider):
     """Dispatch one object returned by the spider.

     * ``Request``: sends ``signals.request_received`` and hands the
       request to ``self.engine.crawl``; returns None.
     * ``BaseItem``: logs it, increments the site's ``itemproc_size``,
       sends ``signals.item_scraped`` and chains item processing and
       ``_itemproc_finished`` on the resulting deferred, which is returned.
     * ``None``: ignored.
     * anything else: an error is logged; returns None.
     """
     if isinstance(output, Request):
         send_catch_log(signal=signals.request_received, request=output,
                        spider=spider)
         self.engine.crawl(request=output, spider=spider)
         return None

     if isinstance(output, BaseItem):
         log.msg(log.formatter.scraped(output, request, response, spider),
                 level=log.DEBUG, spider=spider)
         self.sites[spider].itemproc_size += 1

         def run_item_pipeline(_):
             return self.itemproc.process_item(output, spider)

         dfd = send_catch_log_deferred(
             signal=signals.item_scraped, item=output, spider=spider,
             response=response)
         dfd.addBoth(run_item_pipeline)
         dfd.addBoth(self._itemproc_finished, output, spider)
         return dfd

     if output is not None:
         log.msg(
             "Spider must return Request, BaseItem or None, got %r in %s" %
             (type(output).__name__, request), log.ERROR, spider=spider)
     return None
Example #19
0
 def _process_spidermw_output(self, output, request, response, spider):
     """Route a single spider result (``output``) to the right handler.

     Requests are announced with ``signals.request_received`` and passed
     to ``self.engine.crawl``. Items are logged, counted in the site's
     ``itemproc_size``, announced with ``signals.item_scraped`` and then
     run through ``self.itemproc.process_item`` followed by
     ``_itemproc_finished``; the deferred carrying that chain is returned.
     ``None`` is ignored and any other type is logged as an error. Every
     non-item path returns None.
     """
     if isinstance(output, Request):
         send_catch_log(
             signal=signals.request_received,
             request=output,
             spider=spider,
         )
         self.engine.crawl(request=output, spider=spider)
         return None

     if not isinstance(output, BaseItem):
         if output is not None:
             type_name = type(output).__name__
             log.msg(
                 "Spider must return Request, BaseItem or None, got %r in %s"
                 % (type_name, request),
                 log.ERROR,
                 spider=spider,
             )
         return None

     log.msg(
         log.formatter.scraped(output, request, response, spider),
         level=log.DEBUG,
         spider=spider,
     )
     self.sites[spider].itemproc_size += 1
     dfd = send_catch_log_deferred(
         signal=signals.item_scraped,
         item=output,
         spider=spider,
         response=response,
     )
     dfd.addBoth(lambda _: self.itemproc.process_item(output, spider))
     dfd.addBoth(self._itemproc_finished, output, spider)
     return dfd
Example #20
0
 def _finish_stopping_engine(self):
     """Send ``signals.engine_stopped`` and then notify stats.

     Generator: yields the result of sending the signal, then the result
     of ``stats.engine_stopped()`` (presumably Deferreds consumed by an
     ``inlineCallbacks``-style decorator that is not visible here).
     """
     signal_sent = send_catch_log_deferred(signal=signals.engine_stopped)
     yield signal_sent
     stats_stopped = stats.engine_stopped()
     yield stats_stopped
Example #21
0
 def _get_result(self, signal, *a, **kw):
     """Return the result of ``send_catch_log_deferred`` for ``signal``.

     Positional and keyword arguments are forwarded unchanged.
     """
     result = send_catch_log_deferred(signal, *a, **kw)
     return result
Example #22
0
File: engine.py Project: tml/scrapy
 def start(self):
     """Start the execution engine.

     Records the start time, yields the result of sending
     ``signals.engine_started`` and only afterwards sets ``self.running``
     to True. Raises AssertionError if the engine is already running.
     """
     already_running = self.running
     assert not already_running, "Engine already running"
     self.start_time = time()
     started_signal = send_catch_log_deferred(signal=signals.engine_started)
     yield started_signal
     self.running = True
Example #23
0
File: engine.py Project: tml/scrapy
 def _finish_stopping_engine(self):
     """Finish stopping the engine: signal first, stats second.

     Yields the result of sending ``signals.engine_stopped`` followed by
     the result of ``stats.engine_stopped()`` (presumably driven by an
     ``inlineCallbacks``-style decorator that is not visible here).
     """
     stopped = send_catch_log_deferred(signal=signals.engine_stopped)
     yield stopped
     stopped = stats.engine_stopped()
     yield stopped
Example #24
0
 def send_catch_log_deferred(self, *a, **kw):
     """Forward to ``signal.send_catch_log_deferred``.

     ``sender`` is set to ``self.sender`` unless the caller supplied it;
     all other arguments are passed through unchanged.
     """
     default_sender = self.sender
     if 'sender' not in kw:
         kw['sender'] = default_sender
     return signal.send_catch_log_deferred(*a, **kw)
Example #25
0
 def _get_result(self, signal, *a, **kw):
     """Send ``signal`` through ``send_catch_log_deferred``.

     Extra positional and keyword arguments are forwarded as given; the
     value returned by that call is returned unchanged.
     """
     outcome = send_catch_log_deferred(signal, *a, **kw)
     return outcome
Example #26
0
 def start(self):
     """Start the execution engine.

     Generator: stores the current time in ``self.start_time``, yields the
     result of sending ``signals.engine_started`` and then flags the
     engine as running. Raises AssertionError when ``self.running`` is
     already truthy.
     """
     assert not self.running, \
         "Engine already running"
     self.start_time = time()

     # ``running`` is only set after the engine_started signal was yielded.
     yield send_catch_log_deferred(signal=signals.engine_started)
     self.running = True