Example #1
 def __init__(self, settings, spider_closed_callback):
     self.settings = settings
     self.closing = {}  # dict (spider -> reason) of spiders being closed
     self.closing_dfds = {}  # dict (spider -> deferred) of spiders being closed
     self.running = False
     self.paused = False
     self._next_request_calls = {}
     self.scheduler = load_object(settings['SCHEDULER'])()
     self.downloader = Downloader()
     self.scraper = Scraper(self, self.settings)
     self._spider_closed_callback = spider_closed_callback
Example #2
 def __init__(self, crawler, spider_closed_callback):
     self.settings = crawler.settings
     self.slots = {}
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     self.downloader = Downloader(crawler)
     self.scraper = Scraper(crawler)
     self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
     if self._concurrent_spiders != 1:
         warnings.warn("CONCURRENT_SPIDERS setting is deprecated, use "
             "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
     self._spider_closed_callback = spider_closed_callback
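
Note: every variant above resolves the scheduler class at runtime from a dotted
path stored in the settings, via load_object. A minimal sketch of what such a
dotted-path loader does (simplified; the real scrapy.utils.misc.load_object adds
friendlier error reporting):

from importlib import import_module

def load_object(path):
    """Resolve a dotted path such as 'scrapy.core.scheduler.Scheduler'."""
    module_path, _, name = path.rpartition('.')
    module = import_module(module_path)
    return getattr(module, name)

# Usage, mirroring the engine code above:
# scheduler_cls = load_object(settings['SCHEDULER'])
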
Example #3
 def configure(self):
     """
     Configure execution engine with the given scheduling policy and downloader.
     """
     # NB: 'settings' here refers to the module-level scrapy.conf.settings
     # object imported by old-style Scrapy code, not an engine attribute.
     self.scheduler = load_object(settings['SCHEDULER'])()
     self.spider_scheduler = load_object(settings['SPIDER_SCHEDULER'])()
     self.downloader = Downloader()
     self.scraper = Scraper(self)
     self.configured = True
Example #4
 def configure(self, spider_closed_callback):
     """
     Configure execution engine with the given scheduling policy and downloader.
     """
     self.scheduler = load_object(settings['SCHEDULER'])()
     self.downloader = Downloader()
     self.scraper = Scraper(self)
     self.configured = True
     self._spider_closed_callback = spider_closed_callback
Example #5
 def __init__(self, settings, spider_closed_callback):
     self.settings = settings
     self.slots = {}
     self.running = False
     self.paused = False
     self._next_request_calls = {}
     self.scheduler = load_object(settings['SCHEDULER'])()
     self.downloader = Downloader()
     self.scraper = Scraper(self, self.settings)
     self._spider_closed_callback = spider_closed_callback
Example #6
 def __init__(self, crawler, spider_closed_callback):
     self.settings = crawler.settings
     self.slots = {}
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     self.downloader = Downloader(crawler)
     self.scraper = Scraper(crawler)
     self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS')
     self._spider_closed_callback = spider_closed_callback
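
Note: unlike Example #2, getint is called here without an explicit default.
Assuming the usual Settings.getint(name, default=0) semantics, the bare call
falls back to 0 for a missing key rather than 1, as this stand-in sketch shows:

class SettingsSketch(dict):
    """Stand-in for the settings object; models only the assumed getint semantics."""
    def getint(self, name, default=0):
        return int(self.get(name, default))

s = SettingsSketch()
assert s.getint('CONCURRENT_SPIDERS', 1) == 1  # Example #2 style
assert s.getint('CONCURRENT_SPIDERS') == 0     # Example #6 style falls back to 0
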
Example #7
 def __init__(self, settings, spider_closed_callback):
     self.settings = settings
     self.closing = {} # dict (spider -> reason) of spiders being closed
     self.closing_dfds = {} # dict (spider -> deferred) of spiders being closed
     self.running = False
     self.paused = False
     self._next_request_calls = {}
     self.scheduler = load_object(settings['SCHEDULER'])()
     self.downloader = Downloader()
     self.scraper = Scraper(self, self.settings)
     self._spider_closed_callback = spider_closed_callback
Example #8
 def __init__(self, crawler, spider_closed_callback):
     self.settings = crawler.settings
     self.slots = {}
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings["SCHEDULER"])
     self.downloader = Downloader(crawler)
     self.scraper = Scraper(crawler)
     self._concurrent_spiders = self.settings.getint("CONCURRENT_SPIDERS", 1)
     if self._concurrent_spiders != 1:
         warnings.warn(
             "CONCURRENT_SPIDERS settings is deprecated, use " "Scrapyd max_proc config instead",
             ScrapyDeprecationWarning,
         )
     self._spider_closed_callback = spider_closed_callback
Example #9
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     self.downloader = Downloader(crawler)
     self.scraper = Scraper(crawler)
     self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
     if self._concurrent_spiders != 1:
         warnings.warn("CONCURRENT_SPIDERS setting is deprecated, use "
             "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
     self._spider_closed_callback = spider_closed_callback
Example #10
 def __init__(self, crawler):
     Downloader.__init__(self, crawler)
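
Note: Example #10 shows the bare extension pattern: a Downloader subclass that
delegates construction to the parent. A hedged sketch of what such a subclass
might add, wrapping the fetch(request, spider) entry point the engines above
call; the logging and the import paths are assumptions, not part of any real
project:

from scrapy import log                           # old-style logging module assumed
from scrapy.core.downloader import Downloader    # import path assumed from these examples

class LoggingDownloader(Downloader):
    """Illustrative subclass: log each request before delegating."""

    def fetch(self, request, spider):
        log.msg("Fetching %s" % request.url, spider=spider)
        return Downloader.fetch(self, request, spider)
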
Example #11
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.settings = crawler.settings
        self.slots = {}
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        self.downloader = Downloader(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS')
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield send_catch_log_deferred(signal=signals.engine_started)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        try:
            slot = self.slots[spider]
        except KeyError:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
                self.crawl(request, spider)
            except StopIteration:
                slot.start_requests = None

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slots[spider]
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slots[spider].needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slots[spider]
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.slots \
            and self.scraper.slots[spider].is_idle()
        pending = self.slots[spider].scheduler.has_pending_requests()
        downloading = bool(self.downloader.slots)
        idle = scraper_idle and not (pending or downloading)
        return idle

    @property
    def open_spiders(self):
        return self.slots.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.slots) < self._concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slots[spider].nextcall.schedule()

    def schedule(self, request, spider):
        return self.slots[spider].scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                log.msg(log.formatter.crawled(request, response, spider), \
                    level=log.DEBUG, spider=spider)
                send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_error(failure):
            failure.request = request
            return failure

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success, _on_error)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=None, close_if_idle=True):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_settings(self.settings)
        slot = Slot(start_requests or (), close_if_idle, nextcall, scheduler)
        self.slots[spider] = slot
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        stats.open_spider(spider)
        yield send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            self.slots[spider].nextcall.schedule(5)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slots[spider]
        if slot.closing:
            return slot.closing
        log.msg("Closing spider (%s)" % reason, spider=spider)

        dfd = slot.close()

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))

        dfd.addBoth(lambda _: self.slots.pop(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield send_catch_log_deferred(signal=signals.engine_stopped)
        yield stats.engine_stopped()
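
Note: the slot's nextcall is a CallLaterOnce wrapper around _next_request:
schedule() may be invoked many times per reactor tick, but the wrapped function
runs at most once. A simplified sketch of such a helper (Scrapy ships one as
scrapy.utils.reactor.CallLaterOnce; this version omits its error handling):

from twisted.internet import reactor

class CallLaterOnce(object):
    """Coalesce repeated schedule() calls into a single delayed call."""

    def __init__(self, func, *a, **kw):
        self._func, self._a, self._kw = func, a, kw
        self._call = None

    def schedule(self, delay=0):
        if self._call is None:   # no call pending yet
            self._call = reactor.callLater(delay, self)

    def cancel(self):
        if self._call:
            self._call.cancel()

    def __call__(self):
        self._call = None        # allow the next schedule() to fire again
        return self._func(*self._a, **self._kw)
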
Example #12
class ExecutionEngine(object):
    def __init__(self, settings, spider_closed_callback):
        self.settings = settings
        self.closing = {}  # dict (spider -> reason) of spiders being closed
        self.closing_dfds = {}  # dict (spider -> deferred) of spiders being closed
        self.running = False
        self.paused = False
        self._next_request_calls = {}
        self.scheduler = load_object(settings['SCHEDULER'])()
        self.downloader = Downloader()
        self.scraper = Scraper(self, self.settings)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield send_catch_log_deferred(signal=signals.engine_started)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def is_idle(self):
        return self.scheduler.is_idle() and self.downloader.is_idle() and \
            self.scraper.is_idle()

    def next_request(self, spider, now=False):
        """Scrape the next request for the spider passed.

        The next request to be scraped is retrieved from the scheduler and
        requested from the downloader.

        The spider is closed if there are no more pages to scrape.
        """
        if now:
            self._next_request_calls.pop(spider, None)
        elif spider not in self._next_request_calls:
            call = reactor.callLater(0, self.next_request, spider, now=True)
            self._next_request_calls[spider] = call
            return call
        else:
            return

        if self.paused:
            return reactor.callLater(5, self.next_request, spider)

        while not self._needs_backout(spider):
            if not self._next_request(spider):
                break

        if self.spider_is_idle(spider):
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        return not self.running \
            or self.spider_is_closed(spider) \
            or self.downloader.sites[spider].needs_backout() \
            or self.scraper.sites[spider].needs_backout()

    def _next_request(self, spider):
        # Next pending request from scheduler
        request, deferred = self.scheduler.next_request(spider)
        if request:
            dwld = mustbe_deferred(self.download, request, spider)
            dwld.chainDeferred(deferred).addBoth(lambda _: deferred)
            dwld.addErrback(log.err,
                            "Unhandled error on engine._next_request()",
                            spider=spider)
            return dwld

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.sites \
            and self.scraper.sites[spider].is_idle()
        pending = self.scheduler.spider_has_pending_requests(spider)
        downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
        return scraper_idle and not (pending or downloading)

    def spider_is_closed(self, spider):
        """Return True if the spider is fully closed (ie. not even in the
        closing stage)"""
        return spider not in self.downloader.sites

    def spider_is_open(self, spider):
        """Return True if the spider is fully opened (ie. not in closing
        stage)"""
        return spider in self.downloader.sites and spider not in self.closing

    @property
    def open_spiders(self):
        return self.downloader.sites.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.downloader.sites) < self.downloader.concurrent_spiders

    def crawl(self, request, spider):
        if spider in self.closing:  # ignore requests for spiders being closed
            return
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        schd = mustbe_deferred(self.schedule, request, spider)
        # FIXME: we can't log errors because we would be preventing them from
        # propagating to the request errback. This should be fixed after the
        # next core refactoring.
        #schd.addErrback(log.err, "Error on engine.crawl()")
        schd.addBoth(self.scraper.enqueue_scrape, request, spider)
        schd.addErrback(log.err,
                        "Unhandled error on engine.crawl()",
                        spider=spider)
        schd.addBoth(lambda _: self.next_request(spider))

    def schedule(self, request, spider):
        if spider in self.closing:
            raise IgnoreRequest()
        self.next_request(spider)
        return self.scheduler.enqueue_request(spider, request)

    def download(self, request, spider):
        def _on_success(response):
            """handle the result of a page download"""
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                log.msg(log.formatter.crawled(request, response, spider), \
                    level=log.DEBUG, spider=spider)
                return response
            elif isinstance(response, Request):
                return mustbe_deferred(self.schedule, response, spider)

        def _on_error(_failure):
            """handle an error processing a page"""
            exc = _failure.value
            if isinstance(exc, IgnoreRequest):
                errmsg = _failure.getErrorMessage()
            else:
                errmsg = str(_failure)
            if errmsg:
                log.msg("Error downloading <%s>: %s" % (request.url, errmsg), \
                    level=log.ERROR, spider=spider)
            return Failure(IgnoreRequest(str(exc)))

        def _on_complete(_):
            self.next_request(spider)
            return _

        if spider not in self.downloader.sites:
            return defer.fail(Failure(IgnoreRequest())).addBoth(_on_complete)

        dwld = mustbe_deferred(self.downloader.fetch, request, spider)
        dwld.addCallbacks(_on_success, _on_error)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        yield self.scheduler.open_spider(spider)
        self.downloader.open_spider(spider)
        yield self.scraper.open_spider(spider)
        stats.open_spider(spider)
        yield send_catch_log_deferred(signals.spider_opened, spider=spider)
        self.next_request(spider)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            reactor.callLater(5, self.next_request, spider)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""
        if spider in self.closing:
            return defer.succeed(None)
        log.msg("Closing spider (%s)" % reason, spider=spider)
        self.closing[spider] = reason
        self.scheduler.clear_pending_requests(spider)
        dfd = self.downloader.close_spider(spider)
        self.closing_dfds[spider] = dfd
        dfd.addBoth(lambda _: self.scheduler.close_spider(spider))
        dfd.addErrback(log.err, "Unhandled error in scheduler.close_spider()", \
            spider=spider)
        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, "Unhandled error in scraper.close_spider()", \
            spider=spider)
        dfd.addBoth(lambda _: self._finish_closing_spider(spider))
        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dfds += self.closing_dfds.values()
        dlist = defer.DeferredList(dfds)
        return dlist

    def _finish_closing_spider(self, spider):
        """This function is called after the spider has been closed"""
        reason = self.closing.pop(spider, 'finished')
        call = self._next_request_calls.pop(spider, None)
        if call and call.active():
            call.cancel()
        dfd = send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason)
        dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err,
                       "Unhandled error in stats.close_spider()",
                       spider=spider)
        dfd.addBoth(
            lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))
        dfd.addBoth(lambda _: self.closing_dfds.pop(spider).callback(spider))
        dfd.addBoth(lambda _: self._spider_closed_callback(spider))
        return dfd

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield send_catch_log_deferred(signal=signals.engine_stopped)
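
Note: the close_spider/_finish_closing_spider chains above lean on a recurring
Twisted idiom: each cleanup step is attached with addBoth, so it runs whether
the previous step succeeded or failed, and is immediately followed by
addErrback(log.err, ...) so a failing step is logged and swallowed instead of
aborting the rest of the shutdown. The pattern in isolation (the step names
are illustrative):

from twisted.internet import defer
from twisted.python import log

def _close(name):
    print("closing %s" % name)   # stand-in for a real cleanup step

def shutdown_chain():
    dfd = defer.succeed(None)
    for name in ("downloader", "scraper", "scheduler"):
        dfd.addBoth(lambda _, n=name: _close(n))  # runs regardless of prior outcome
        dfd.addErrback(log.err)                   # trap errors so later steps still run
    return dfd
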
Example #13
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        self.downloader = Downloader(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
        if self._concurrent_spiders != 1:
            warnings.warn("CONCURRENT_SPIDERS setting is deprecated, use "
                "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception as exc:
                log.err(None, 'Obtaining request from start requests', \
                        spider=spider)
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = self.scraper.slot.is_idle()
        pending = self.slot.scheduler.has_pending_requests()
        downloading = bool(self.downloader.active)
        idle = scraper_idle and not (pending or downloading)
        return idle

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                request=request, spider=spider)
        return self.slot.scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                log.msg(spider=spider, **logkws)
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            self.slot.nextcall.schedule(5)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        log.msg(format="Closing spider (%(reason)s)", reason=reason, spider=spider)

        dfd = slot.close()

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log.err, spider=spider)

        # XXX: spider_stats argument was added for backwards compatibility with
        # stats collection refactoring added in 0.15. it should be removed in 0.17.
        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason, spider_stats=self.crawler.stats.get_stats()))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: log.msg(format="Spider closed (%(reason)s)", reason=reason, spider=spider))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
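
Note: new in this example is the _closewait hand-off: start() parks on a
Deferred that _finish_stopping_engine() fires much later, so a caller can
simply yield engine.start() and resume only once the crawl has fully shut
down. The mechanism reduced to its core (a sketch, not the full engine):

from twisted.internet import defer

class EngineSketch(object):

    @defer.inlineCallbacks
    def start(self):
        self._closewait = defer.Deferred()
        yield self._closewait             # suspend here until stop() fires it

    def stop(self):
        self._closewait.callback(None)    # wakes the start() generator back up
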
Example #14
class ExecutionEngine(object):

    def __init__(self, settings, spider_closed_callback):
        self.settings = settings
        self.closing = {} # dict (spider -> reason) of spiders being closed
        self.closing_dfds = {} # dict (spider -> deferred) of spiders being closed
        self.running = False
        self.paused = False
        self._next_request_calls = {}
        self.scheduler = load_object(settings['SCHEDULER'])()
        self.downloader = Downloader()
        self.scraper = Scraper(self, self.settings)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield send_catch_log_deferred(signal=signals.engine_started)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def is_idle(self):
        return self.scheduler.is_idle() and self.downloader.is_idle() and \
            self.scraper.is_idle()

    def next_request(self, spider, now=False):
        """Scrape the next request for the spider passed.

        The next request to be scraped is retrieved from the scheduler and
        requested from the downloader.

        The spider is closed if there are no more pages to scrape.
        """
        if now:
            self._next_request_calls.pop(spider, None)
        elif spider not in self._next_request_calls:
            call = reactor.callLater(0, self.next_request, spider, now=True)
            self._next_request_calls[spider] = call
            return call
        else:
            return

        if self.paused:
            return reactor.callLater(5, self.next_request, spider)

        while not self._needs_backout(spider):
            if not self._next_request(spider):
                break

        if self.spider_is_idle(spider):
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        return not self.running \
            or self.spider_is_closed(spider) \
            or self.downloader.sites[spider].needs_backout() \
            or self.scraper.sites[spider].needs_backout()

    def _next_request(self, spider):
        # Next pending request from scheduler
        request, deferred = self.scheduler.next_request(spider)
        if request:
            dwld = mustbe_deferred(self.download, request, spider)
            dwld.chainDeferred(deferred).addBoth(lambda _: deferred)
            dwld.addErrback(log.err, "Unhandled error on engine._next_request()",
                spider=spider)
            return dwld

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.sites \
            and self.scraper.sites[spider].is_idle()
        pending = self.scheduler.spider_has_pending_requests(spider)
        downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
        return scraper_idle and not (pending or downloading)

    def spider_is_closed(self, spider):
        """Return True if the spider is fully closed (ie. not even in the
        closing stage)"""
        return spider not in self.downloader.sites

    def spider_is_open(self, spider):
        """Return True if the spider is fully opened (ie. not in closing
        stage)"""
        return spider in self.downloader.sites and spider not in self.closing

    @property
    def open_spiders(self):
        return self.downloader.sites.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.downloader.sites) < self.downloader.concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        if spider in self.closing: # ignore requests for spiders being closed
            return
        schd = mustbe_deferred(self.schedule, request, spider)
        # FIXME: we can't log errors because we would be preventing them from
        # propagating to the request errback. This should be fixed after the
        # next core refactoring.
        #schd.addErrback(log.err, "Error on engine.crawl()")
        schd.addBoth(self.scraper.enqueue_scrape, request, spider)
        schd.addErrback(log.err, "Unhandled error on engine.crawl()", spider=spider)
        schd.addBoth(lambda _: self.next_request(spider))

    def schedule(self, request, spider):
        if spider in self.closing:
            raise IgnoreRequest()
        self.next_request(spider)
        return self.scheduler.enqueue_request(spider, request)

    def download(self, request, spider):
        def _on_success(response):
            """handle the result of a page download"""
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                log.msg(log.formatter.crawled(request, response, spider), \
                    level=log.DEBUG, spider=spider)
                return response
            elif isinstance(response, Request):
                return mustbe_deferred(self.schedule, response, spider)

        def _on_error(_failure):
            """handle an error processing a page"""
            exc = _failure.value
            if isinstance(exc, IgnoreRequest):
                errmsg = _failure.getErrorMessage()
                level = exc.level
            else:
                errmsg = str(_failure)
                level = log.ERROR
            if errmsg:
                log.msg("Error downloading <%s>: %s" % (request.url, errmsg), \
                    level=level, spider=spider)
            return Failure(IgnoreRequest(str(exc)))

        def _on_complete(_):
            self.next_request(spider)
            return _

        if spider not in self.downloader.sites:
            return defer.fail(Failure(IgnoreRequest())).addBoth(_on_complete)

        dwld = mustbe_deferred(self.downloader.fetch, request, spider)
        dwld.addCallbacks(_on_success, _on_error)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        yield self.scheduler.open_spider(spider)
        self.downloader.open_spider(spider)
        yield self.scraper.open_spider(spider)
        stats.open_spider(spider)
        yield send_catch_log_deferred(signals.spider_opened, spider=spider)
        self.next_request(spider)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            reactor.callLater(5, self.next_request, spider)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""
        if spider in self.closing:
            return defer.succeed(None)
        log.msg("Closing spider (%s)" % reason, spider=spider)
        self.closing[spider] = reason
        self.scheduler.clear_pending_requests(spider)
        dfd = self.downloader.close_spider(spider)
        self.closing_dfds[spider] = dfd
        dfd.addBoth(lambda _: self.scheduler.close_spider(spider))
        dfd.addErrback(log.err, "Unhandled error in scheduler.close_spider()", \
            spider=spider)
        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, "Unhandled error in scraper.close_spider()", \
            spider=spider)
        dfd.addBoth(lambda _: self._finish_closing_spider(spider))
        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dfds += self.closing_dfds.values()
        dlist = defer.DeferredList(dfds)
        return dlist

    def _finish_closing_spider(self, spider):
        """This function is called after the spider has been closed"""
        reason = self.closing.pop(spider, 'finished')
        call = self._next_request_calls.pop(spider, None)
        if call and call.active():
            call.cancel()
        dfd = send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason)
        dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, "Unhandled error in stats.close_spider()",
            spider=spider)
        dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))
        dfd.addBoth(lambda _: self.closing_dfds.pop(spider).callback(spider))
        dfd.addBoth(lambda _: self._spider_closed_callback(spider))
        return dfd

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield send_catch_log_deferred(signal=signals.engine_stopped)
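
Note: the _spider_idle hook gives extensions a veto over shutdown: any
spider_idle handler that raises DontCloseSpider keeps the spider open for
another loop, and the engine guarantees the hook fires again. A minimal
extension using that mechanism (modern Scrapy import paths; the pending_urls
condition is illustrative):

from scrapy import signals
from scrapy.exceptions import DontCloseSpider

class KeepAliveExtension(object):
    """Illustrative extension: veto idle shutdown while work remains."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
        return ext

    def spider_idle(self, spider):
        if getattr(spider, 'pending_urls', None):   # illustrative condition
            raise DontCloseSpider
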
Example #15
class ExecutionEngine(object):

    def __init__(self, settings, spider_closed_callback):
        self.settings = settings
        self.slots = {}
        self.running = False
        self.paused = False
        self._next_request_calls = {}
        self.scheduler = load_object(settings['SCHEDULER'])()
        self.downloader = Downloader()
        self.scraper = Scraper(self, self.settings)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield send_catch_log_deferred(signal=signals.engine_started)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def is_idle(self):
        return self.scheduler.is_idle() and self.downloader.is_idle() and \
            self.scraper.is_idle()

    def next_request(self, spider, now=False):
        """Scrape the next request for the spider passed.

        The next request to be scraped is retrieved from the scheduler and
        requested from the downloader.

        The spider is closed if there are no more pages to scrape.
        """
        if now:
            self._next_request_calls.pop(spider, None)
        elif spider not in self._next_request_calls:
            call = reactor.callLater(0, self.next_request, spider, now=True)
            self._next_request_calls[spider] = call
            return call
        else:
            return

        if self.paused:
            return reactor.callLater(5, self.next_request, spider)

        while not self._needs_backout(spider):
            if not self._next_request(spider):
                break

        if self.spider_is_idle(spider):
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slots[spider]
        return not self.running \
            or slot.closing \
            or self.spider_is_closed(spider) \
            or self.downloader.sites[spider].needs_backout() \
            or self.scraper.sites[spider].needs_backout()

    def _next_request(self, spider):
        request = self.scheduler.next_request(spider)
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        slot = self.slots[spider]
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: self.next_request(spider))
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = defer.Deferred()
        d.addBoth(self.scraper.enqueue_scrape, request, spider)
        d.addErrback(log.err, spider=spider)
        if isinstance(response, Failure):
            d.errback(response)
        else:
            d.callback(response)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.sites \
            and self.scraper.sites[spider].is_idle()
        pending = self.scheduler.spider_has_pending_requests(spider)
        downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
        return scraper_idle and not (pending or downloading)

    def spider_is_closed(self, spider):
        """Return True if the spider is fully closed (ie. not even in the
        closing stage)"""
        return spider not in self.downloader.sites

    @property
    def open_spiders(self):
        return self.downloader.sites.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.downloader.sites) < self.downloader.concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.next_request(spider)

    def schedule(self, request, spider):
        return self.scheduler.enqueue_request(spider, request)

    def download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        if isinstance(request, Response):
            return request
        d = self._download(request, spider)
        d.addCallback(self.download, spider)
        d.addBoth(self._remove_request, slot, request)
        return d

    def _remove_request(self, _, slot, request):
        slot.remove_request(request)
        return _

    def _download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        def _on_success(response):
            """handle the result of a page download"""
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                log.msg(log.formatter.crawled(request, response, spider), \
                    level=log.DEBUG, spider=spider)
                send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            self.next_request(spider)
            return _

        dwld = mustbe_deferred(self.downloader.fetch, request, spider)
        dwld.addCallback(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        self.slots[spider] = Slot()
        yield self.scheduler.open_spider(spider)
        self.downloader.open_spider(spider)
        yield self.scraper.open_spider(spider)
        stats.open_spider(spider)
        yield send_catch_log_deferred(signals.spider_opened, spider=spider)
        self.next_request(spider)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            reactor.callLater(5, self.next_request, spider)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slots[spider]
        if slot.closing:
            return slot.closing
        log.msg("Closing spider (%s)" % reason, spider=spider)

        self.scheduler.clear_pending_requests(spider)

        dfd = slot.close()

        dfd.addBoth(lambda _: self.downloader.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self.scheduler.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._cancel_next_call(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))

        dfd.addBoth(lambda _: self.slots.pop(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _cancel_next_call(self, spider):
        call = self._next_request_calls.pop(spider, None)
        if call and call.active():
            call.cancel()

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield send_catch_log_deferred(signal=signals.engine_stopped)
        yield stats.engine_stopped()
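
Note: none of the examples include the Slot class itself, but its interface
can be read off the calls made on it: add_request/remove_request bracket
in-flight requests, close() returns a Deferred that fires once the slot
drains, and closing doubles as the flag checked in _needs_backout. A hedged
reconstruction matching the Slot(start_requests, close_if_idle, nextcall,
scheduler) signature of Examples #11, #13 and #16 (not the actual Scrapy
source):

from twisted.internet import defer

class Slot(object):
    """Hedged reconstruction of the per-spider slot used above."""

    def __init__(self, start_requests, close_if_idle, nextcall, scheduler):
        self.closing = False              # becomes a Deferred while closing
        self.inprogress = set()           # requests currently in flight
        self.start_requests = iter(start_requests)
        self.close_if_idle = close_if_idle
        self.nextcall = nextcall
        self.scheduler = scheduler

    def add_request(self, request):
        self.inprogress.add(request)

    def remove_request(self, request):
        self.inprogress.discard(request)
        self._maybe_fire_closing()

    def close(self):
        self.closing = defer.Deferred()
        self._maybe_fire_closing()
        return self.closing

    def _maybe_fire_closing(self):
        if self.closing and not self.inprogress:
            self.nextcall.cancel()
            self.closing.callback(None)
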
Example #16
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.settings = crawler.settings
        self.slots = {}
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        self.downloader = Downloader(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
        if self._concurrent_spiders != 1:
            warnings.warn("CONCURRENT_SPIDERS setting is deprecated, use "
                "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield send_catch_log_deferred(signal=signals.engine_started)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        try:
            slot = self.slots[spider]
        except KeyError:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
                self.crawl(request, spider)
            except StopIteration:
                slot.start_requests = None

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slots[spider]
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slots[spider].needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slots[spider]
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.slots \
            and self.scraper.slots[spider].is_idle()
        pending = self.slots[spider].scheduler.has_pending_requests()
        downloading = bool(self.downloader.slots)
        idle = scraper_idle and not (pending or downloading)
        return idle

    @property
    def open_spiders(self):
        return self.slots.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.slots) < self._concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slots[spider].nextcall.schedule()

    def schedule(self, request, spider):
        return self.slots[spider].scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                log.msg(log.formatter.crawled(request, response, spider), \
                    level=log.DEBUG, spider=spider)
                send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_error(failure):
            failure.request = request
            return failure

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success, _on_error)
        dwld.addBoth(_on_complete)
        return dwld
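        # _on_success tags the Response with the Request that produced it
        # before response_received fires, so signal handlers can correlate the
        # two; _on_complete is attached with addBoth, so the slot gets a fresh
        # nextcall.schedule() tick whether the fetch succeeded or failed.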

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=None, close_if_idle=True):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_settings(self.settings)
        slot = Slot(start_requests or (), close_if_idle, nextcall, scheduler)
        self.slots[spider] = slot
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        stats.open_spider(spider)
        yield send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()
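    # CallLaterOnce coalesces repeated schedule() calls into a single reactor
    # tick, so _next_request runs at most once per iteration however many
    # callers poke the slot. A minimal sketch of the idea (the real helper
    # lives in scrapy.utils.reactor and also supports cancellation):
    #
    #   from twisted.internet import reactor
    #
    #   class CallLaterOnce(object):
    #       def __init__(self, func, *a, **kw):
    #           self._func, self._a, self._kw = func, a, kw
    #           self._call = None
    #       def schedule(self, delay=0):
    #           if self._call is None:  # ignore calls while one is pending
    #               self._call = reactor.callLater(delay, self)
    #       def __call__(self):
    #           self._call = None
    #           return self._func(*self._a, **self._kw)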

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            self.slots[spider].nextcall.schedule(5)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')
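    # Usage sketch: an extension keeps an idle spider alive by raising
    # DontCloseSpider from a spider_idle handler. The wiring below assumes the
    # pydispatch-style signal API of this Scrapy era (import paths vary by
    # version), and has_pending_work is a hypothetical helper:
    #
    #   from scrapy import signals
    #   from scrapy.exceptions import DontCloseSpider
    #   from scrapy.xlib.pydispatch import dispatcher
    #
    #   class KeepAliveExtension(object):
    #       def __init__(self):
    #           dispatcher.connect(self.spider_idle, signals.spider_idle)
    #       def spider_idle(self, spider):
    #           if has_pending_work(spider):  # hypothetical helper
    #               raise DontCloseSpider("external work still pending")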

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slots[spider]
        if slot.closing:
            return slot.closing
        log.msg("Closing spider (%s)" % reason, spider=spider)

        dfd = slot.close()

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))

        dfd.addBoth(lambda _: self.slots.pop(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd
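    # Design note: every cleanup step above is chained with addBoth, so it runs
    # whether the previous step succeeded or failed, and each is immediately
    # followed by its own addErrback(log.err, ...), which logs and absorbs that
    # step's failure. One failing step therefore cannot stop the scheduler
    # close, the spider_closed signal, stats, slot removal, or the final
    # closed-callback from running.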

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield send_catch_log_deferred(signal=signals.engine_stopped)
        yield stats.engine_stopped()
Example #18
0
class ExecutionEngine(object):

    def __init__(self):
        self.configured = False
        self.closing = {} # dict (spider -> reason) of spiders being closed
        self.running = False
        self.killed = False
        self.paused = False
        self._next_request_calls = {}
        self._crawled_logline = load_object(settings['LOG_FORMATTER_CRAWLED'])

    def configure(self, spider_closed_callback):
        """
        Configure execution engine with the given scheduling policy and downloader.
        """
        self.scheduler = load_object(settings['SCHEDULER'])()
        self.downloader = Downloader()
        self.scraper = Scraper(self)
        self.configured = True
        self._spider_closed_callback = spider_closed_callback

    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        send_catch_log(signal=signals.engine_started, sender=self.__class__)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def kill(self):
        """Forces shutdown without waiting for pending transfers to finish.
        stop() must have been called first
        """
        assert not self.running, "Call engine.stop() before engine.kill()"
        self.killed = True

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def is_idle(self):
        return self.scheduler.is_idle() and self.downloader.is_idle() and \
            self.scraper.is_idle()

    def next_request(self, spider, now=False):
        """Scrape the next request for the spider passed.

        The next request to be scraped is retrieved from the scheduler and
        requested from the downloader.

        The spider is closed if there are no more pages to scrape.
        """
        if now:
            self._next_request_calls.pop(spider, None)
        elif spider not in self._next_request_calls:
            call = reactor.callLater(0, self.next_request, spider, now=True)
            self._next_request_calls[spider] = call
            return call
        else:
            return

        if self.paused:
            return reactor.callLater(5, self.next_request, spider)

        while not self._needs_backout(spider):
            if not self._next_request(spider):
                break

        if self.spider_is_idle(spider):
            self._spider_idle(spider)
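        # The now/_next_request_calls bookkeeping above coalesces bursts of
        # next_request() calls: the first call books reactor.callLater(0, ...)
        # and records it, subsequent calls return early while one is pending,
        # and the booked call re-enters with now=True, clearing the record
        # before doing the real work. Later engine versions replace this
        # pattern with the CallLaterOnce helper.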

    def _needs_backout(self, spider):
        return not self.running \
            or self.spider_is_closed(spider) \
            or self.downloader.sites[spider].needs_backout() \
            or self.scraper.sites[spider].needs_backout()

    def _next_request(self, spider):
        # Next pending request from scheduler
        request, deferred = self.scheduler.next_request(spider)
        if request:
            dwld = mustbe_deferred(self.download, request, spider)
            dwld.chainDeferred(deferred).addBoth(lambda _: deferred)
            dwld.addErrback(log.err, "Unhandled error on engine._next_request()",
                spider=spider)
            return dwld

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.sites \
            and self.scraper.sites[spider].is_idle()
        pending = self.scheduler.spider_has_pending_requests(spider)
        downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
        return scraper_idle and not (pending or downloading)

    def spider_is_closed(self, spider):
        """Return True if the spider is fully closed (ie. not even in the
        closing stage)"""
        return spider not in self.downloader.sites

    def spider_is_open(self, spider):
        """Return True if the spider is fully opened (ie. not in closing
        stage)"""
        return spider in self.downloader.sites and spider not in self.closing

    @property
    def open_spiders(self):
        return self.downloader.sites.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.downloader.sites) < self.downloader.concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        if spider in self.closing: # ignore requests for spiders being closed
            return
        schd = mustbe_deferred(self.schedule, request, spider)
        # FIXME: we can't log errors because we would be preventing them from
        # propagating to the request errback. This should be fixed after the
        # next core refactoring.
        #schd.addErrback(log.err, "Error on engine.crawl()")
        schd.addBoth(self.scraper.enqueue_scrape, request, spider)
        schd.addErrback(log.err, "Unhandled error on engine.crawl()", spider=spider)
        schd.addBoth(lambda _: self.next_request(spider))

    def schedule(self, request, spider):
        if spider in self.closing:
            raise IgnoreRequest()
        self.next_request(spider)
        return self.scheduler.enqueue_request(spider, request)

    def download(self, request, spider):
        def _on_success(response):
            """handle the result of a page download"""
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                log.msg(self._crawled_logline(request, response), \
                    level=log.DEBUG, spider=spider)
                return response
            elif isinstance(response, Request):
                newrequest = response
                dfd = mustbe_deferred(self.schedule, newrequest, spider)
                if newrequest.callback:
                    # XXX: this is a bit hacky and should be removed
                    dfd.addCallbacks(newrequest.callback, newrequest.errback)
                return dfd

        def _on_error(_failure):
            """handle an error processing a page"""
            exc = _failure.value
            if isinstance(exc, IgnoreRequest):
                errmsg = _failure.getErrorMessage()
                level = exc.level
            else:
                errmsg = str(_failure)
                level = log.ERROR
            if errmsg:
                log.msg("Crawling <%s>: %s" % (request.url, errmsg), \
                    level=level, spider=spider)
            return Failure(IgnoreRequest(str(exc)))

        def _on_complete(_):
            self.next_request(spider)
            return _

        if spider not in self.downloader.sites:
            return defer.fail(Failure(IgnoreRequest())).addBoth(_on_complete)

        dwld = mustbe_deferred(self.downloader.fetch, request, spider)
        dwld.addCallbacks(_on_success, _on_error)
        dwld.addBoth(_on_complete)
        return dwld
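        # _on_error logs the failure and normalises it into an IgnoreRequest
        # Failure, so downstream callbacks only ever see a Response or an
        # IgnoreRequest error; _on_complete is attached with addBoth and keeps
        # the spider's request flow moving in either case.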

    def open_spider(self, spider):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        self.scheduler.open_spider(spider)
        self.downloader.open_spider(spider)
        self.scraper.open_spider(spider)
        stats.open_spider(spider)
        send_catch_log(signals.spider_opened, sender=self.__class__, spider=spider)
        self.next_request(spider)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        try:
            dispatcher.send(signal=signals.spider_idle, sender=self.__class__, \
                spider=spider)
        except DontCloseSpider:
            reactor.callLater(5, self.next_request, spider)
            return
        except Exception as e:
            log.msg("Exception caught on 'spider_idle' signal dispatch: %r" % e, \
                level=log.ERROR)
        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')
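        # Unlike the slot-based engine above, this version dispatches
        # spider_idle synchronously and relies on DontCloseSpider propagating
        # as a plain exception out of dispatcher.send; any other handler
        # exception is logged and the idle check proceeds, so a buggy handler
        # cannot keep the spider open indefinitely.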