Example #1
 def __init__(self, engine, settings):
     self.sites = {}
     self.spidermw = SpiderMiddlewareManager.from_settings(settings)
     itemproc_cls = load_object(settings['ITEM_PROCESSOR'])
     self.itemproc = itemproc_cls.from_settings(settings)
     self.concurrent_items = settings.getint('CONCURRENT_ITEMS')
     self.engine = engine
Example #2
 def __init__(self, crawler):
     self.slots = {}
     self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
     itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
     self.itemproc = itemproc_cls.from_crawler(crawler)
     self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
     self.crawler = crawler
Example #3
 def __init__(self, crawler):
     super(Scraper, self).__init__(crawler.metas)
     self.slots = {}
     self.spidermw = SpiderMiddlewareManager(crawler.metas)
     itemproc_cls = load_object(self.item_processor.to_value())
     self.itemproc = itemproc_cls(self.metas)
     self.concurrent_items = self.concurrent_items.to_value()
     self.crawler = crawler
Example #4
 def __init__(self, crawler):
     self.slot = None
     self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
     itemproc_cls = load_object(crawler.settings["ITEM_PROCESSOR"])
     self.itemproc = itemproc_cls.from_crawler(crawler)
     self.concurrent_items = crawler.settings.getint("CONCURRENT_ITEMS")
     self.crawler = crawler
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
Example #5
    def __init__(self, crawler):
        self.slot = None
        self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)

        # ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
        itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
        self.itemproc = itemproc_cls.from_crawler(crawler)

        # CONCURRENT_ITEMS = 100
        self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
        self.crawler = crawler
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
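Examples #2 through #5 resolve the ITEM_PROCESSOR setting with load_object and build the processor through from_crawler, and Examples #8 and #9 below go on to call open_spider, close_spider and process_item on that object. A minimal sketch of something satisfying that implied interface might look like the following; the class name NoopItemProcessor and its trivial bodies are illustrative assumptions, not Scrapy's ItemPipelineManager.

from twisted.internet import defer


class NoopItemProcessor:
    """Hypothetical stand-in exposing the interface the Scraper examples rely on."""

    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        # Mirrors the itemproc_cls.from_crawler(crawler) call in the __init__ snippets.
        return cls(crawler)

    def open_spider(self, spider):
        # Example #9 yields this inside inlineCallbacks, so returning a Deferred is safe.
        return defer.succeed(None)

    def close_spider(self, spider):
        return defer.succeed(None)

    def process_item(self, item, spider):
        # Must return a Deferred so the Scraper can attach _itemproc_finished to it.
        return defer.succeed(item)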
Example #6
    def setUp(self):
        class TestSpider(Spider):
            name = 'test'

        self.spider = TestSpider
        scrapy_default_middlewares = {
            'scrapy.spidermiddlewares.referer.RefererMiddleware': 700
        }

        # monkey patch SPIDER_MIDDLEWARES_BASE to include only referer middleware
        sys.modules['scrapy.settings.default_settings'].SPIDER_MIDDLEWARES_BASE = scrapy_default_middlewares

        custom_settings = {
            'SPIDER_MIDDLEWARES': {'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000}
        }
        crawler = get_crawler(self.spider, custom_settings)
        self.add_frontera_scheduler(crawler)
        self.smw = SpiderMiddlewareManager.from_crawler(crawler)
Example #7
    def __init__(self, crawler):
        self.slot = None
        self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
        itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
        # ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'
        # ItemPipelineManager is a subclass of MiddlewareManager.
        # It adds one feature: _add_middleware appends each pipeline's process_item
        # method to the callback chain, which is then invoked as callback(spider)
        # (pipelinemanager.process_item).

        self.itemproc = itemproc_cls.from_crawler(crawler)
        # Instantiate the pipeline manager.

        self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
        # Defaults to 100.

        self.crawler = crawler
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
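The comments in Example #7 describe ItemPipelineManager as a MiddlewareManager subclass whose _add_middleware hook appends each pipeline's process_item to a callback chain that is then fired once per item. The snippet below is a simplified, hypothetical illustration of that chaining idea only; TinyPipelineManager is an invented name, not Scrapy's actual implementation.

from twisted.internet import defer


class TinyPipelineManager:
    """Illustrative chain: each pipeline's process_item becomes a Deferred callback."""

    def __init__(self, *pipelines):
        self.methods = [p.process_item for p in pipelines if hasattr(p, 'process_item')]

    def process_item(self, item, spider):
        dfd = defer.succeed(item)
        for method in self.methods:
            # Each callback receives the item returned by the previous pipeline.
            dfd.addCallback(method, spider)
        return dfd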
Example #8
class Scraper(object):

    def __init__(self, engine):
        self.sites = {}
        self.spidermw = SpiderMiddlewareManager()
        self.itemproc = load_object(settings['ITEM_PROCESSOR'])()
        self.concurrent_items = settings.getint('CONCURRENT_ITEMS')
        self.engine = engine

    def open_spider(self, spider):
        """Open the given spider for scraping and allocate resources for it"""
        assert spider not in self.sites, "Spider already opened: %s" % spider
        self.sites[spider] = SpiderInfo()
        self.itemproc.open_spider(spider)

    def close_spider(self, spider):
        """Close a spider being scraped and release its resources"""
        assert spider in self.sites, "Spider not opened: %s" % spider
        site = self.sites[spider]
        site.closing = defer.Deferred()
        self.itemproc.close_spider(spider)

    def is_idle(self):
        """Return True if there isn't any more spiders to process"""
        return not self.sites

    def enqueue_scrape(self, response, request, spider):
        site = self.sites[spider]
        dfd = site.add_response_request(response, request)
        # FIXME: this can't be called here because the stats spider may be
        # already closed
        #stats.max_value('scraper/max_active_size', site.active_size, \
        #    spider=spider)
        def finish_scraping(_):
            site.finish_response(response)
            if site.closing and site.is_idle():
                del self.sites[spider]
                site.closing.callback(None)
            self._scrape_next(spider, site)
            return _
        dfd.addBoth(finish_scraping)
        dfd.addErrback(log.err, 'Scraper bug processing %s' % request, \
            spider=spider)
        self._scrape_next(spider, site)
        return dfd

    def _scrape_next(self, spider, site):
        while site.queue:
            response, request, deferred = site.next_response_request_deferred()
            self._scrape(response, request, spider).chainDeferred(deferred)

    def _scrape(self, response, request, spider):
        """Handle the downloaded response or failure trough the spider
        callback/errback"""
        assert isinstance(response, (Response, Failure))

        dfd = self._scrape2(response, request, spider) # returns spiders processed output
        dfd.addErrback(self.handle_spider_error, request, spider)
        dfd.addCallback(self.handle_spider_output, request, response, spider)
        return dfd

    def _scrape2(self, request_result, request, spider):
        """Handle the diferent cases of request's result been a Response or a
        Failure"""
        if not isinstance(request_result, Failure):
            return self.spidermw.scrape_response(self.call_spider, \
                request_result, request, spider)
        else:
            # FIXME: don't ignore errors in spider middleware
            dfd = self.call_spider(request_result, request, spider)
            return dfd.addErrback(self._check_propagated_failure, \
                request_result, request, spider)

    def call_spider(self, result, request, spider):
        dfd = defer_result(result)
        dfd.addCallbacks(request.callback or spider.parse, request.errback)
        return dfd.addCallback(iterate_spider_output)

    def handle_spider_error(self, _failure, request, spider, propagated_failure=None):
        referer = request.headers.get('Referer', None)
        msg = "Spider exception caught while processing <%s> (referer: <%s>): %s" % \
            (request.url, referer, _failure)
        log.msg(msg, log.ERROR, spider=spider)
        stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, \
            spider=spider)

    def handle_spider_output(self, result, request, response, spider):
        if not result:
            return defer_succeed(None)
        dfd = parallel(iter(result), self.concurrent_items,
            self._process_spidermw_output, request, response, spider)
        return dfd

    def _process_spidermw_output(self, output, request, response, spider):
        """Process each Request/Item (given in the output parameter) returned
        from the given spider
        """
        # TODO: keep closing state internally instead of checking engine
        if spider in self.engine.closing:
            return
        elif isinstance(output, Request):
            send_catch_log(signal=signals.request_received, request=output, \
                spider=spider)
            self.engine.crawl(request=output, spider=spider)
        elif isinstance(output, BaseItem):
            log.msg("Scraped %s in <%s>" % (output, request.url), level=log.DEBUG, \
                spider=spider)
            send_catch_log(signal=signals.item_scraped, sender=self.__class__, \
                item=output, spider=spider, response=response)
            self.sites[spider].itemproc_size += 1
            # FIXME: this can't be called here because the stats spider may be
            # already closed
            #stats.max_value('scraper/max_itemproc_size', \
            #        self.sites[spider].itemproc_size, spider=spider)
            dfd = self.itemproc.process_item(output, spider)
            dfd.addBoth(self._itemproc_finished, output, spider)
            return dfd
        elif output is None:
            pass
        else:
            log.msg("Spider must return Request, BaseItem or None, got %r in %s" % \
                (type(output).__name__, request), log.ERROR, spider=spider)

    def _check_propagated_failure(self, spider_failure, propagated_failure, request, spider):
        """Log and silence the bugs raised outside of spiders, but still allow
        spiders to be notified about general failures while downloading spider
        generated requests
        """
        # ignored requests are commonly propagated exceptions safes to be silenced
        if isinstance(spider_failure.value, IgnoreRequest):
            return
        elif spider_failure is propagated_failure:
            log.err(spider_failure, 'Unhandled error propagated to spider', \
                spider=spider)
            return # stop propagating this error
        else:
            return spider_failure # exceptions raised in the spider code

    def _itemproc_finished(self, output, item, spider):
        """ItemProcessor finished for the given ``item`` and returned ``output``
        """
        self.sites[spider].itemproc_size -= 1
        if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
                log.msg("Dropped %s - %s" % (item, str(ex)), level=log.WARNING, spider=spider)
                send_catch_log(signal=signals.item_dropped, sender=self.__class__, \
                    item=item, spider=spider, exception=output.value)
            else:
                log.msg('Error processing %s - %s' % (item, output), \
                    log.ERROR, spider=spider)
        else:
            log.msg("Passed %s" % item, log.INFO, spider=spider)
            send_catch_log(signal=signals.item_passed, sender=self.__class__, \
                item=item, spider=spider, output=output)
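Example #8 drives a per-spider SpiderInfo object (renamed Slot in Example #9) only through a handful of calls: add_response_request, next_response_request_deferred, finish_response and is_idle, plus the queue, itemproc_size and closing attributes. A hypothetical minimal object honouring those calls could look like the sketch below; the name SlotSketch and its internals are assumptions rather than Scrapy's real Slot.

from twisted.internet import defer


class SlotSketch:
    """Hypothetical minimal slot matching the calls made in Examples #8 and #9."""

    def __init__(self):
        self.queue = []          # pending (response, request, deferred) tuples
        self.active = set()      # responses currently being scraped
        self.itemproc_size = 0   # items currently inside the item processor
        self.closing = None      # set to a Deferred once the spider starts closing

    def add_response_request(self, response, request):
        deferred = defer.Deferred()
        self.queue.append((response, request, deferred))
        return deferred

    def next_response_request_deferred(self):
        response, request, deferred = self.queue.pop(0)
        self.active.add(response)
        return response, request, deferred

    def finish_response(self, response, request=None):
        self.active.discard(response)

    def is_idle(self):
        return not (self.queue or self.active)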
Example #9
class Scraper(SettingObject):

    item_processor = StringField(default="scrapy.contrib.pipeline.ItemPipelineManager")
    concurrent_items = IntegerField(default=100)

    def __init__(self, crawler):
        super(Scraper, self).__init__(crawler.metas)
        self.slots = {}
        self.spidermw = SpiderMiddlewareManager(crawler.metas)
        itemproc_cls = load_object(self.item_processor.to_value())
        self.itemproc = itemproc_cls(self.metas)
        self.concurrent_items = self.concurrent_items.to_value()
        self.crawler = crawler

    @defer.inlineCallbacks
    def open_spider(self, spider):
        """Open the given spider for scraping and allocate resources for it"""
        assert spider not in self.slots, "Spider already opened: %s" % spider
        self.slots[spider] = Slot()
        yield self.itemproc.open_spider(spider)

    def close_spider(self, spider):
        """Close a spider being scraped and release its resources"""
        assert spider in self.slots, "Spider not opened: %s" % spider
        slot = self.slots[spider]
        slot.closing = defer.Deferred()
        slot.closing.addCallback(self.itemproc.close_spider)
        self._check_if_closing(spider, slot)
        return slot.closing

    def is_idle(self):
        """Return True if there isn't any more spiders to process"""
        return not self.slots

    def _check_if_closing(self, spider, slot):
        if slot.closing and slot.is_idle():
            del self.slots[spider]
            slot.closing.callback(spider)

    def enqueue_scrape(self, response, request, spider):
        slot = self.slots[spider]
        dfd = slot.add_response_request(response, request)

        def finish_scraping(_):
            slot.finish_response(response, request)
            self._check_if_closing(spider, slot)
            self._scrape_next(spider, slot)
            return _

        dfd.addBoth(finish_scraping)
        dfd.addErrback(log.err, "Scraper bug processing %s" % request, spider=spider)
        self._scrape_next(spider, slot)
        return dfd

    def _scrape_next(self, spider, slot):
        while slot.queue:
            response, request, deferred = slot.next_response_request_deferred()
            self._scrape(response, request, spider).chainDeferred(deferred)

    def _scrape(self, response, request, spider):
        """Handle the downloaded response or failure trough the spider
        callback/errback"""
        assert isinstance(response, (Response, Failure))

        dfd = self._scrape2(response, request, spider)  # returns spiders processed output
        dfd.addErrback(self.handle_spider_error, request, response, spider)
        dfd.addCallback(self.handle_spider_output, request, response, spider)
        return dfd

    def _scrape2(self, request_result, request, spider):
        """Handle the diferent cases of request's result been a Response or a
        Failure"""
        if not isinstance(request_result, Failure):
            return self.spidermw.scrape_response(self.call_spider, request_result, request, spider)
        else:
            # FIXME: don't ignore errors in spider middleware
            dfd = self.call_spider(request_result, request, spider)
            return dfd.addErrback(self._log_download_errors, request_result, request, spider)

    def call_spider(self, result, request, spider):
        dfd = defer_result(result)
        dfd.addCallbacks(request.callback or spider.parse, request.errback)
        return dfd.addCallback(iterate_spider_output)

    def handle_spider_error(self, _failure, request, response, spider):
        exc = _failure.value
        if isinstance(exc, CloseSpider):
            self.crawler.engine.close_spider(spider, exc.reason or "cancelled")
            return
        log.err(_failure, "Spider error processing %s" % request, spider=spider)
        send_catch_log(signal=signals.spider_error, failure=_failure, response=response, spider=spider)
        stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, spider=spider)

    def handle_spider_output(self, result, request, response, spider):
        if not result:
            return defer_succeed(None)
        it = iter_errback(result, self.handle_spider_error, request, response, spider)
        dfd = parallel(it, self.concurrent_items, self._process_spidermw_output, request, response, spider)
        return dfd

    def _process_spidermw_output(self, output, request, response, spider):
        """Process each Request/Item (given in the output parameter) returned
        from the given spider
        """
        if isinstance(output, Request):
            send_catch_log(signal=signals.request_received, request=output, spider=spider)
            self.crawler.engine.crawl(request=output, spider=spider)
        elif isinstance(output, BaseItem):
            self.slots[spider].itemproc_size += 1
            dfd = self.itemproc.process_item(output, spider)
            dfd.addBoth(self._itemproc_finished, output, response, spider)
            return dfd
        elif output is None:
            pass
        else:
            log.msg(
                "Spider must return Request, BaseItem or None, got %r in %s" % (type(output).__name__, request),
                log.ERROR,
                spider=spider,
            )

    def _log_download_errors(self, spider_failure, download_failure, request, spider):
        """Log and silence errors that come from the engine (typically download
        errors that got propagated through here)
        """
        if spider_failure is download_failure:
            log.msg("Error downloading %s: %s" % (request, spider_failure.getErrorMessage()), log.ERROR, spider=spider)
            return
        return spider_failure

    def _itemproc_finished(self, output, item, response, spider):
        """ItemProcessor finished for the given ``item`` and returned ``output``
        """
        self.slots[spider].itemproc_size -= 1
        if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
                log.msg(log.formatter.dropped(item, ex, response, spider), level=log.WARNING, spider=spider)
                return send_catch_log_deferred(
                    signal=signals.item_dropped, item=item, spider=spider, exception=output.value
                )
            else:
                log.err(output, "Error processing %s" % item, spider=spider)
        else:
            log.msg(log.formatter.scraped(output, response, spider), log.DEBUG, spider=spider)
            return send_catch_log_deferred(signal=signals.item_scraped, item=output, response=response, spider=spider)
Example #10
 def setUp(self):
     self.request = Request('http://example.com/index.html')
     self.response = Response(self.request.url, request=self.request)
     self.crawler = get_crawler(Spider)
     self.spider = self.crawler._create_spider('foo')
     self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
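Building on the setUp in Example #10, a test could exercise the manager through scrape_response, whose call shape (scrape_func, response, request, spider) appears in Examples #8 and #9. The method below is a hypothetical sketch under that assumption, not part of Scrapy's test suite.

 def test_scrape_response_passthrough(self):
     results = []

     def scrape_func(response, request, spider):
         # Plays the role of Scraper.call_spider from Examples #8 and #9.
         return []

     dfd = self.mwman.scrape_response(scrape_func, self.response, self.request, self.spider)
     # With only the default middlewares installed, the (empty) spider output is
     # expected to pass through the chain; collect it for assertions.
     dfd.addCallback(lambda output: results.extend(output))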