Example #1
 def _logerror(self, failure, request, spider):
     if failure.type is not IgnoreRequest:
         logger.error("Error downloading %(request)s: %(f_exception)s",
                      {'request': request, 'f_exception': failure.value},
                      exc_info=failure_to_exc_info(failure),
                      extra={'spider': spider})
     return failure
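This and most of the examples below share one pattern: a printf-style message plus a mapping for its %(...)s placeholders, the Failure converted to an exc_info tuple so the traceback is rendered, and the spider passed through extra so log handlers can tag the record. A minimal standalone sketch of that pattern (log_failure and its arguments are hypothetical, not Scrapy API):

import logging

from scrapy.utils.log import failure_to_exc_info

logger = logging.getLogger(__name__)

def log_failure(failure, request, spider):
    # Hypothetical helper mirroring the examples: the mapping fills the
    # %(...)s placeholders, exc_info makes logging print the traceback,
    # and extra={'spider': ...} tags the record for per-spider handlers.
    logger.error("Error downloading %(request)s: %(exc)s",
                 {'request': request, 'exc': failure.value},
                 exc_info=failure_to_exc_info(failure),
                 extra={'spider': spider})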
Example #2
 def logerror(failure, recv):
     # dont_log and spider are free variables from the enclosing
     # signal-dispatch function's scope (this is a nested helper)
     if dont_log is None or not isinstance(failure.value, dont_log):
         logger.error("Error caught on signal handler: %(receiver)s",
                      {'receiver': recv},
                      exc_info=failure_to_exc_info(failure),
                      extra={'spider': spider})
     return failure
Example #3
File: media.py Project: 01-/scrapy
    def _process_request(self, request, info):
        fp = request_fingerprint(request)
        cb = request.callback or (lambda _: _)
        eb = request.errback
        request.callback = None
        request.errback = None

        # Return cached result if request was already seen
        if fp in info.downloaded:
            return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

        # Otherwise, wait for result
        wad = Deferred().addCallbacks(cb, eb)
        info.waiting[fp].append(wad)

        # Check if request is downloading right now to avoid doing it twice
        if fp in info.downloading:
            return wad

        # Download request checking media_to_download hook output first
        info.downloading.add(fp)
        dfd = mustbe_deferred(self.media_to_download, request, info)
        dfd.addCallback(self._check_media_to_download, request, info)
        dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
        dfd.addErrback(lambda f: logger.error(
            f.value, exc_info=failure_to_exc_info(f), extra={'spider': info.spider})
        )
        return dfd.addBoth(lambda _: wad)  # it must return wad in the end
Example #4
    def test_failure(self):
        try:
            0 / 0
        except ZeroDivisionError:
            exc_info = sys.exc_info()
            failure = Failure()

        self.assertTupleEqual(exc_info, failure_to_exc_info(failure))
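This test, together with test_non_failure in the final example, pins down the helper's contract: a sys.exc_info()-style (type, value, traceback) tuple for a Twisted Failure, and None for anything else. A minimal sketch consistent with both tests, assuming Twisted's Failure exposes type, value and getTracebackObject():

from twisted.python.failure import Failure

def failure_to_exc_info(failure):
    # Sketch: turn a Failure into an exc_info-compatible tuple,
    # returning None for non-Failure input.
    if isinstance(failure, Failure):
        return (failure.type, failure.value, failure.getTracebackObject())
    return None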
Example #5
def err(_stuff=None, _why=None, **kw):
    warnings.warn('log.err has been deprecated, create a python logger and '
                  'use its error method instead',
                  ScrapyDeprecationWarning, stacklevel=2)

    level = kw.pop('level', logging.ERROR)
    failure = kw.pop('failure', _stuff) or Failure()
    message = kw.pop('why', _why) or failure.value
    logger.log(level, message, *[kw] if kw else [], exc_info=failure_to_exc_info(failure))
Example #6
 def _logerror(self, failure, request, spider):
     if failure.type is not IgnoreRequest:
         logger.error(
             "Error downloading %(request)s: %(f_exception)s",
             {"request": request, "f_exception": failure.value},
             exc_info=failure_to_exc_info(failure),
             extra={"spider": spider},
         )
     return failure
Example #7
File: engine.py Project: 01-/scrapy
 def _next_request_from_scheduler(self, spider):
     slot = self.slot
     request = slot.scheduler.next_request()
     if not request:
         return
     d = self._download(request, spider)
     d.addBoth(self._handle_downloader_output, request, spider)
     d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                        exc_info=failure_to_exc_info(f),
                                        extra={'spider': spider}))
     d.addBoth(lambda _: slot.remove_request(request))
     d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                        exc_info=failure_to_exc_info(f),
                                        extra={'spider': spider}))
     d.addBoth(lambda _: slot.nextcall.schedule())
     d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                        exc_info=failure_to_exc_info(f),
                                        extra={'spider': spider}))
     return d
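The alternating addBoth/addErrback pairs above are deliberate: each addErrback fires only if the preceding step failed, and because the logging lambda returns None it also swallows the failure, letting the chain proceed to the next step. A small standalone sketch of that behavior (names hypothetical):

from twisted.internet import defer

def step(_):
    raise RuntimeError('step failed')

d = defer.succeed(None)
d.addBoth(step)                                    # runs on success or failure
d.addErrback(lambda f: print('logged:', f.value))  # handles and swallows the error
d.addBoth(lambda r: print('chain continues with', r))  # fires with r = None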
Example #8
File: engine.py Project: 01-/scrapy
 def _handle_downloader_output(self, response, request, spider):
     assert isinstance(response, (Request, Response, Failure)), response
     # downloader middleware can return requests (for example, redirects)
     if isinstance(response, Request):
         self.crawl(response, spider)
         return
     # response is a Response or Failure
     d = self.scraper.enqueue_scrape(response, request, spider)
     d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                         exc_info=failure_to_exc_info(f),
                                         extra={'spider': spider}))
     return d
Example #9
 def item_completed(self, results, item, info):
     if self.LOG_FAILED_RESULTS:
         for ok, value in results:
             if not ok:
                 logger.error(
                     '%(class)s found errors processing %(item)s',
                     {'class': self.__class__.__name__, 'item': item},
                     exc_info=failure_to_exc_info(value),
                     extra={'spider': info.spider}
                 )
     item["files"] = [{"file_id": x['file_id'], "url": x['url']} for ok, x in results if ok]
     return item
Example #10
File: media.py Project: 01-/scrapy
 def item_completed(self, results, item, info):
     """Called per item when all media requests has been processed"""
     if self.LOG_FAILED_RESULTS:
         for ok, value in results:
             if not ok:
                 logger.error(
                     '%(class)s found errors processing %(item)s',
                     {'class': self.__class__.__name__, 'item': item},
                     exc_info=failure_to_exc_info(value),
                     extra={'spider': info.spider}
                 )
     return item
Example #11
 def item_completed(self, results, item, info):
     """Called per item when all media requests has been processed"""
     if self.LOG_FAILED_RESULTS:
         for ok, value in results:
             if not ok:
                 logger.error('%(class)s found errors processing %(item)s',
                              {
                                  'class': self.__class__.__name__,
                                  'item': item
                              },
                              exc_info=failure_to_exc_info(value),
                              extra={'spider': info.spider})
     return item
Example #12
    def _next_request_from_scheduler(self) -> Optional[Deferred]:
        assert self.slot is not None  # typing
        assert self.spider is not None  # typing

        request = self.slot.scheduler.next_request()
        if request is None:
            return None

        d = self._download(request, self.spider)
        d.addBoth(self._handle_downloader_output, request)
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': self.spider}))
        d.addBoth(lambda _: self.slot.remove_request(request))
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': self.spider}))
        d.addBoth(lambda _: self.slot.nextcall.schedule())
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': self.spider}))
        return d
Example #13
 def _handle_downloader_output(self, response, request, spider):
     assert isinstance(response, (Request, Response, Failure)), response
     # downloader middleware can return requests (for example, redirects)
     if isinstance(response, Request):
         self.crawl(response, spider)
         return
     # response is a Response or Failure
     d = self.scraper.enqueue_scrape(response, request, spider)
     d.addErrback(
         lambda f: logger.error('Error while enqueuing downloader output',
                                exc_info=failure_to_exc_info(f),
                                extra={'spider': spider}))
     return d
Example #14
 def _next_request_from_scheduler(self, spider):
     slot = self.slot
     request = slot.scheduler.next_request()
     if not request:
         return
     d = self._download(request, spider)
     d.addBoth(self._handle_downloader_output, request, spider)
     d.addErrback(
         lambda f: logger.info('Error while handling downloader output',
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
     d.addBoth(lambda _: slot.remove_request(request))
     d.addErrback(
         lambda f: logger.info('Error while removing request from slot',
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
     d.addBoth(lambda _: slot.nextcall.schedule())
     d.addErrback(
         lambda f: logger.info('Error while scheduling new request',
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
     return d
Example #15
 def item_completed(self, results, item, info):
     for ok, value in results:
         if ok:
             item['isload'] = '下载成功'  # "download succeeded"
             item['file_path'] = value['path']
         else:
             item['isload'] = '下载失败'  # "download failed"
             logger.error('%(class)s found errors processing %(item)s', {
                 'class': self.__class__.__name__,
                 'item': item
             },
                          exc_info=failure_to_exc_info(value),
                          extra={'spider': info.spider})
     return item
Example #16
def err(_stuff=None, _why=None, **kw):
    warnings.warn(
        'log.err has been deprecated, create a python logger and '
        'use its error method instead',
        ScrapyDeprecationWarning,
        stacklevel=2)

    level = kw.pop('level', logging.ERROR)
    failure = kw.pop('failure', _stuff) or Failure()
    message = kw.pop('why', _why) or failure.value
    logger.log(level,
               message,
               *[kw] if kw else [],
               exc_info=failure_to_exc_info(failure))
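The *[kw] if kw else [] idiom above passes the leftover keyword dict as logging's single positional argument only when it is non-empty; the logging module treats a lone dict argument as the mapping for %-style placeholders. A standalone illustration (the message and mapping are made up):

import logging

logging.basicConfig(level=logging.INFO)
kw = {'spider': 'example.com'}
logging.getLogger(__name__).log(
    logging.INFO,
    'closing %(spider)s',    # %-style placeholder filled from the mapping
    *[kw] if kw else [])     # expands to the mapping, or to nothing at all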
Example #17
 def _next_request_from_scheduler(self, spider):
     slot = self.slot
     request = slot.scheduler.next_request()  # pull the next request from the scheduler
     if not request:
         return
     # run each downloader middleware's process_request, then send the request
     # out for download via self._enqueue_request and attach callbacks such as
     # process_response
     d = self._download(request, spider)
     d.addBoth(self._handle_downloader_output, request, spider)
     d.addErrback(
         lambda f: logger.info('Error while handling downloader output',
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
     d.addBoth(lambda _: slot.remove_request(request))
     d.addErrback(
         lambda f: logger.info('Error while removing request from slot',
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
     d.addBoth(lambda _: slot.nextcall.schedule())
     d.addErrback(
         lambda f: logger.info('Error while scheduling new request',
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
     return d
Example #18
 def _handle_downloader_output(self, response, request, spider):
     if not isinstance(response, (Request, Response, Failure)):
         raise TypeError(
             "Incorrect type: expected Request, Response or Failure, got "
             f"{type(response)}: {response!r}")
     # downloader middleware can return requests (for example, redirects)
     if isinstance(response, Request):
         self.crawl(response, spider)
         return
     # response is a Response or Failure
     d = self.scraper.enqueue_scrape(response, request, spider)
     d.addErrback(
         lambda f: logger.error('Error while enqueuing downloader output',
                                exc_info=failure_to_exc_info(f),
                                extra={'spider': spider}))
     return d
Example #19
 def _handle_downloader_output(self, response, request, spider):
      ## the download result must be one of Request, Response, or Failure
     assert isinstance(response, (Request, Response, Failure)), response
     # downloader middleware can return requests (for example, redirects)
      ## if the result is a Request, call crawl again to run the Scheduler's enqueue logic
     if isinstance(response, Request):
         self.crawl(response, spider)
         return
     # response is a Response or Failure
      ## if the result is a Response or Failure, hand it to the scraper's
      ## enqueue_scrape method for further processing, which mainly
      ## interacts with the spiders and pipelines
     d = self.scraper.enqueue_scrape(response, request, spider)
     d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                         exc_info=failure_to_exc_info(f),
                                         extra={'spider': spider}))
     return d
Example #20
 def _handle_downloader_output(self, response, request,
                               spider):  # receives the response produced by the download
     assert isinstance(response, (Request, Response, Failure)), response
     # downloader middleware can return requests (for example, redirects)
     if isinstance(response, Request):  # if the result is a Request, enqueue it directly via self.crawl
         self.crawl(response, spider)  # fingerprint-filter the request, enqueue it if it passes, then let the heartbeat recurse
         return
     # response is a Response or Failure
     # for a proper response, run the scraper's three processing functions on
     # the downloader output; a Request result is enqueued again, while a dict
     # or Item result is handed to process_item for further processing
     d = self.scraper.enqueue_scrape(response, request, spider)
     d.addErrback(
         lambda f: logger.error('Error while enqueuing downloader output',
                                exc_info=failure_to_exc_info(f),
                                extra={'spider': spider}))
     return d
Example #21
 def item_completed(self, results, item, info):
     if self.LOG_FAILED_RESULTS:
         for ok, value in results:
             if not ok:
                 logger.error('%(class)s found errors processing %(item)s',
                              {
                                  'class': self.__class__.__name__,
                                  'item': item
                              },
                              exc_info=failure_to_exc_info(value),
                              extra={'spider': info.spider})
     item["files"] = [{
         "file_id": x['file_id'],
         "url": x['url']
     } for ok, x in results if ok]
     return item
Example #22
 def close_spider(self, spider):
     slot = self.slot
     if not slot.itemcount and not self.store_empty:
         return
     slot.exporter.finish_exporting()
     logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
     log_args = {'format': self.format,
                 'itemcount': slot.itemcount,
                 'uri': slot.uri}
     d = defer.maybeDeferred(slot.storage.store, slot.file)
     d.addCallback(lambda _: logger.info(logfmt % "Stored", log_args,
                                         extra={'spider': spider}))
     d.addErrback(lambda f: logger.error(logfmt % "Error storing", log_args,
                                         exc_info=failure_to_exc_info(f),
                                         extra={'spider': spider}))
     return d
Example #23
 def enqueue_scrape(self, response, request, spider):
     slot = self.slot
     dfd = slot.add_response_request(response, request)
     def finish_scraping(_):
         slot.finish_response(response, request)
         self._check_if_closing(spider, slot)
         self._scrape_next(spider, slot)
         return _
     dfd.addBoth(finish_scraping)
     dfd.addErrback(
         lambda f: logger.error('Scraper bug processing %(request)s',
                                {'request': request},
                                exc_info=failure_to_exc_info(f),
                                extra={'spider': spider}))
     self._scrape_next(spider, slot)
     return dfd
Example #24
    def _handle_downloader_output(self, response, request, spider):
        # the download result must be one of: Request/Response/Failure
        assert isinstance(response, (Request, Response, Failure)), response

        # downloader middleware can return requests (for example, redirects)
        # case 1: a Request must go through another round of downloading
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        # case 2: use the scraper to interact with spiders/pipelines, see scrapy/core/scraper.py
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(
            lambda f: logger.error('Error while enqueuing downloader output',
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        return d
Example #25
 def _handle_downloader_output(self, response, request, spider):
      # the download result must be one of Request, Response, or Failure
      assert isinstance(response, (Request, Response, Failure)), response
     # downloader middleware can return requests (for example, redirects)
      # if it is a Request, call crawl again to run the Scheduler's enqueue logic
      if isinstance(response, Request):
         self.crawl(response, spider)
         return
     # response is a Response or Failure
      # mainly interacts with the spiders and pipelines
      d = self.scraper.enqueue_scrape(response, request, spider)
     d.addErrback(
         lambda f: logger.error('Error while enqueuing downloader output',
                                exc_info=failure_to_exc_info(f),
                                extra={'spider': spider}))
     return d
Example #26
 def handle_spider_error(self, _failure, request, response, spider):
     exc = _failure.value
     if isinstance(exc, CloseSpider):
         self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
         return
     logkws = self.logformatter.spider_error(_failure, request, response,
                                             spider)
     logger.log(*logformatter_adapter(logkws),
                exc_info=failure_to_exc_info(_failure),
                extra={'spider': spider})
     self.signals.send_catch_log(signal=signals.spider_error,
                                 failure=_failure,
                                 response=response,
                                 spider=spider)
     self.crawler.stats.inc_value("spider_exceptions/%s" %
                                  _failure.value.__class__.__name__,
                                  spider=spider)
Example #27
    def process_pokemon_request(self, request, info):
        fingerprint = request_fingerprint(request)
        callback = request.callback or (lambda _: _)
        errorback = request.errback
        request.callback = None
        request.errback = None

        # Wait for the result (this variant omits the fingerprint-cache check)
        wad = Deferred().addCallbacks(callback, errorback)
        info.waiting[fingerprint].append(wad)

        info.downloading.add(fingerprint)
        dfd = mustbe_deferred(self.media_to_download, request, info)
        dfd.addCallback(self._check_media_to_download, request, info)
        dfd.addBoth(self._cache_result_and_execute_waiters, fingerprint, info)
        dfd.addErrback(lambda f: logger.error(f.value,
                                              exc_info=failure_to_exc_info(f),
                                              extra={'spider': info.spider}))
        return dfd.addBoth(lambda _: wad)  # it must return wad in the end
Example #28
 def _itemproc_finished(self, output, item, response, spider):
     """ItemProcessor finished for the given ``item`` and returned ``output``"""
     self.slot.itemproc_size -= 1
     if isinstance(output, Failure):
         ex = output.value
         if isinstance(ex, DropItem):
             logkws = self.logformatter.dropped(item, ex, response, spider)
             if logkws is not None:
                 logger.log(*logformatter_adapter(logkws),
                            extra={"spider": spider})
             return self.signals.send_catch_log_deferred(
                 signal=signals.item_dropped,
                 item=item,
                 response=response,
                 spider=spider,
                 exception=output.value,
             )
         else:
             logkws = self.logformatter.item_error(item, ex, response,
                                                   spider)
             logger.log(
                 *logformatter_adapter(logkws),
                 extra={"spider": spider},
                 exc_info=failure_to_exc_info(output),
             )
             return self.signals.send_catch_log_deferred(
                 signal=signals.item_error,
                 item=item,
                 response=response,
                 spider=spider,
                 failure=output,
             )
     else:
         logkws = self.logformatter.scraped(output, response, spider)
         if logkws is not None:
             logger.log(*logformatter_adapter(logkws),
                        extra={"spider": spider})
         return self.signals.send_catch_log_deferred(
             signal=signals.item_scraped,
             item=output,
             response=response,
             spider=spider,
         )
Example #29
    def media_to_download(self, request, info):
        def _onsuccess(result):
            if not result:
                return  # returning None forces the download

            last_modified = result.get('last_modified', None)
            if not last_modified:
                return  # returning None forces the download

            age_seconds = time.time() - last_modified
            age_days = age_seconds / 60 / 60 / 24
            if age_days > self.expires:
                return  # returning None forces the download

            referer = referer_str(request)
            logger.debug(
                'File (uptodate): Downloaded %(medianame)s from %(request)s '
                'referred in <%(referer)s>', {
                    'medianame': self.MEDIA_NAME,
                    'request': request,
                    'referer': referer
                },
                extra={'spider': info.spider})
            self.inc_stats(info.spider, 'uptodate')

            checksum = result.get('checksum', None)

            return {
                'checksum': checksum,
                # TODO: Refactor this!
                'image_fields': info.urls_fields[request.url],
                'url': request.url,
                'path': '%s%s' % (self.image_base_url, path),
            }

        path = self.file_path(request, info=info)
        dfd = defer.maybeDeferred(self.store.stat_file, path, info)
        dfd.addCallbacks(_onsuccess, lambda _: None)
        dfd.addErrback(lambda f: logger.error(self.__class__.__name__ +
                                              '.store.stat_file',
                                              exc_info=failure_to_exc_info(f),
                                              extra={'spider': info.spider}))
        return dfd
Example #30
    def enqueue_scrape(self, result: Union[Response, Failure], request: Request, spider: Spider) -> Deferred:
        if self.slot is None:
            raise RuntimeError("Scraper slot not assigned")
        dfd = self.slot.add_response_request(result, request)

        def finish_scraping(_):
            self.slot.finish_response(result, request)
            self._check_if_closing(spider)
            self._scrape_next(spider)
            return _

        dfd.addBoth(finish_scraping)
        dfd.addErrback(
            lambda f: logger.error('Scraper bug processing %(request)s',
                                   {'request': request},
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        self._scrape_next(spider)
        return dfd
Example #31
 def handle_spider_error(self, _failure, request, response, spider):
     exc = _failure.value
     if isinstance(exc, CloseSpider):
         self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
         return
     logger.error(
         "Spider error processing %(request)s (referer: %(referer)s)", {
             'request': request,
             'referer': referer_str(request)
         },
         exc_info=failure_to_exc_info(_failure),
         extra={'spider': spider})
     self.signals.send_catch_log(signal=signals.spider_error,
                                 failure=_failure,
                                 response=response,
                                 spider=spider)
     self.crawler.stats.inc_value("spider_exceptions/%s" %
                                  _failure.value.__class__.__name__,
                                  spider=spider)
Example #32
    def _log_download_errors(self, spider_failure, download_failure, request, spider):
        """Log and silence errors that come from the engine (typically download
        errors that got propagated thru here)
        """
        if (isinstance(download_failure, Failure) and
                not download_failure.check(IgnoreRequest)):
            if download_failure.frames:
                logger.error('Error downloading %(request)s',
                             {'request': request},
                             exc_info=failure_to_exc_info(download_failure),
                             extra={'spider': spider})
            else:
                errmsg = download_failure.getErrorMessage()
                if errmsg:
                    logger.error('Error downloading %(request)s: %(errmsg)s',
                                 {'request': request, 'errmsg': errmsg},
                                 extra={'spider': spider})

        if spider_failure is not download_failure:
            return spider_failure
Example #33
 def handle_spider_error(self, _failure, request, response, spider):
     exc = _failure.value
     if isinstance(exc, CloseSpider):
         self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
         return
     logger.error(
         "Spider error processing %(request)s (referer: %(referer)s)",
         {'request': request, 'referer': referer_str(request)},
         exc_info=failure_to_exc_info(_failure),
         extra={'spider': spider}
     )
     self.signals.send_catch_log(
         signal=signals.spider_error,
         failure=_failure, response=response,
         spider=spider
     )
     self.crawler.stats.inc_value(
         "spider_exceptions/%s" % _failure.value.__class__.__name__,
         spider=spider
     )
Example #34
    def enqueue_scrape(self, response, request, spider):
        # runs the three spider-middleware hooks (process_spider_input,
        # process_spider_exception, process_spider_output) and then the
        # pipeline's process_item handler
        slot = self.slot
        dfd = slot.add_response_request(response, request)  # push the data into self.queue

        def finish_scraping(_):
            slot.finish_response(response, request)
            self._check_if_closing(spider, slot)
            self._scrape_next(spider, slot)
            return _

        dfd.addBoth(finish_scraping)
        dfd.addErrback(lambda f: logger.error(
            'Scraper bug processing %(request)s', {'request': request},
            exc_info=failure_to_exc_info(f),
            extra={'spider': spider}))
        # mainly to run scrape_response, i.e. the function in the scraper
        # that actually processes the response
        self._scrape_next(spider, slot)
        return dfd
Example #35
    def media_to_download(self, request, info):
        path = self.file_path(request, info=info)

        def _onsuccess(result):
            if not result:
                return  # returning None forces the download

            last_modified = result.get('last_modified', None)
            if not last_modified:
                return  # returning None forces the download

            age_seconds = time.time() - last_modified
            age_days = age_seconds / 60 / 60 / 24
            if age_days > self.EXPIRES:
                return  # returning None forces the download

            referer = request.headers.get('Referer')
            logger.debug(
                'File (uptodate): Downloaded %(medianame)s from %(request)s '
                'referred in <%(referer)s>', {
                    'medianame': self.MEDIA_NAME,
                    'request': request,
                    'referer': referer
                },
                extra={'spider': info.spider})
            self.inc_stats(info.spider, 'uptodate')

            checksum = result.get('checksum', None)
            # ret = {'url': request.url, 'path': path, 'checksum': checksum}
            # filename = result.get('filename', None)
            # if filename:
            #     ret['filename'] = filename
            return {'url': request.url, 'path': path, 'checksum': checksum}

        dfd = defer.maybeDeferred(self.store.stat_file, path, info)
        dfd.addCallbacks(_onsuccess, lambda _: None)
        dfd.addErrback(lambda f: logger.error(self.__class__.__name__ +
                                              '.store.stat_file',
                                              exc_info=failure_to_exc_info(f),
                                              extra={'spider': info.spider}))
        return dfd
Example #36
File: files.py Project: orian/umo
    def media_to_download(self, request, info):
        path = self.file_path(request, info=info)
        def _onsuccess(result):
            if not result:
                return  # returning None forces the download

            last_modified = result.get('last_modified', None)
            if not last_modified:
                return  # returning None forces the download

            age_seconds = time.time() - last_modified
            age_days = age_seconds / 60 / 60 / 24
            if age_days > self.EXPIRES:
                return  # returning None forces the download

            referer = request.headers.get('Referer')
            logger.debug(
                'File (uptodate): Downloaded %(medianame)s from %(request)s '
                'referred in <%(referer)s>',
                {'medianame': self.MEDIA_NAME, 'request': request,
                 'referer': referer},
                extra={'spider': info.spider}
            )
            self.inc_stats(info.spider, 'uptodate')

            checksum = result.get('checksum', None)
            # ret = {'url': request.url, 'path': path, 'checksum': checksum}
            # filename = result.get('filename', None)
            # if filename:
            #     ret['filename'] = filename
            return {'url': request.url, 'path': path, 'checksum': checksum}

        dfd = defer.maybeDeferred(self.store.stat_file, path, info)
        dfd.addCallbacks(_onsuccess, lambda _: None)
        dfd.addErrback(
            lambda f:
            logger.error(self.__class__.__name__ + '.store.stat_file',
                         exc_info=failure_to_exc_info(f),
                         extra={'spider': info.spider})
        )
        return dfd
Example #37
 def _itemproc_finished(self, output, item, response, spider):
     """ItemProcessor finished for the given ``item`` and returned ``output``
     """
     self.slot.itemproc_size -= 1
     if isinstance(output, Failure):
         ex = output.value
         if isinstance(ex, DropItem):
             logkws = self.logformatter.dropped(item, ex, response, spider)
             logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
             return self.signals.send_catch_log_deferred(
                 signal=signals.item_dropped, item=item, response=response,
                 spider=spider, exception=output.value)
         else:
             logger.error('Error processing %(item)s', {'item': item},
                          exc_info=failure_to_exc_info(output),
                          extra={'spider': spider})
     else:
         logkws = self.logformatter.scraped(output, response, spider)
         logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
         return self.signals.send_catch_log_deferred(
             signal=signals.item_scraped, item=output, response=response,
             spider=spider)
Example #38
    def _handle_downloader_output(self, result: Union[Request, Response,
                                                      Failure],
                                  request: Request) -> Optional[Deferred]:
        assert self.spider is not None  # typing

        if not isinstance(result, (Request, Response, Failure)):
            raise TypeError(
                f"Incorrect type: expected Request, Response or Failure, got {type(result)}: {result!r}"
            )

        # downloader middleware can return requests (for example, redirects)
        if isinstance(result, Request):
            self.crawl(result)
            return None

        d = self.scraper.enqueue_scrape(result, request, self.spider)
        d.addErrback(lambda f: logger.error(
            "Error while enqueuing downloader output",
            exc_info=failure_to_exc_info(f),
            extra={'spider': self.spider},
        ))
        return d
Example #39
    def enqueue_scrape(self, response, request,
                       spider):  # the response here is actually a deferred object
        slot = self.slot
        dfd = slot.add_response_request(response, request)  # push the data into the buffer

        def finish_scraping(_):
            slot.finish_response(response, request)  # remove this result from the slot
            self._check_if_closing(spider, slot)  # check whether we are in the middle of closing
            # register handling of the next response once this one finishes
            # (this merely registers the work on the deferred chain)
            self._scrape_next(spider, slot)
            return _

        dfd.addBoth(finish_scraping)
        dfd.addErrback(lambda f: logger.error(
            'Scraper bug processing %(request)s', {'request': request},
            exc_info=failure_to_exc_info(f),
            extra={'spider': spider}))
        # this is where the slot is actually driven to do its work
        # (note: the work queued in the slot, not this particular request)
        self._scrape_next(spider, slot)
        return dfd
Example #40
    def test_default_item_completed(self):
        item = dict(name='name')
        assert self.pipe.item_completed([], item, self.info) is item

        # Check that failures are logged by default
        fail = Failure(Exception())
        results = [(True, 1), (False, fail)]

        with LogCapture() as l:
            new_item = self.pipe.item_completed(results, item, self.info)

        assert new_item is item
        assert len(l.records) == 1
        record = l.records[0]
        assert record.levelname == 'ERROR'
        self.assertTupleEqual(record.exc_info, failure_to_exc_info(fail))

        # disable failure logging and check again
        self.pipe.LOG_FAILED_RESULTS = False
        with LogCapture() as l:
            new_item = self.pipe.item_completed(results, item, self.info)
        assert new_item is item
        assert len(l.records) == 0
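The results argument in this test has the shape produced by Twisted's DeferredList (which MediaPipeline uses to collect its media downloads): a list of (success, result) pairs where result is a Failure when success is False. A quick illustration, independent of Scrapy:

from twisted.internet import defer

d = defer.DeferredList(
    [defer.succeed(1), defer.fail(RuntimeError('boom'))],
    consumeErrors=True)  # keep the failure from being logged as unhandled
d.addCallback(print)     # prints [(True, 1), (False, <Failure ...>)]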
Example #41
 def _itemproc_finished(self, output, item, response, spider):
     """ItemProcessor finished for the given ``item`` and returned ``output``
     """
     self.slot.itemproc_size -= 1
     if isinstance(output, Failure):
         ex = output.value
          ## if a DropItem exception is raised during pipeline processing, the
          ## result is ignored: as seen here, to drop a result in a pipeline,
          ## simply raise DropItem and scrapy handles the rest
         if isinstance(ex, DropItem):
             logkws = self.logformatter.dropped(item, ex, response, spider)
             logger.log(*logformatter_adapter(logkws),
                        extra={'spider': spider})
             return self.signals.send_catch_log_deferred(
                 signal=signals.item_dropped,
                 item=item,
                 response=response,
                 spider=spider,
                 exception=output.value)
         else:
             logger.error('Error processing %(item)s', {'item': item},
                          exc_info=failure_to_exc_info(output),
                          extra={'spider': spider})
             return self.signals.send_catch_log_deferred(
                 signal=signals.item_error,
                 item=item,
                 response=response,
                 spider=spider,
                 failure=output)
     else:
         logkws = self.logformatter.scraped(output, response, spider)
         logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
         return self.signals.send_catch_log_deferred(
             signal=signals.item_scraped,
             item=output,
             response=response,
             spider=spider)
Example #42
 def close_spider(self, spider):
     deferred_list = []
     for slot in self.slots:
         if not slot.itemcount and not slot.store_empty:
             # We still need to call slot.storage.store to get the file
             # properly closed, but append the deferred instead of
             # returning early so the remaining slots are processed too.
             deferred_list.append(
                 defer.maybeDeferred(slot.storage.store, slot.file))
             continue
         slot.finish_exporting()
         logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
         log_args = {
             'format': slot.format,
             'itemcount': slot.itemcount,
             'uri': slot.uri
         }
         d = defer.maybeDeferred(slot.storage.store, slot.file)
         # Bind log_args via a default argument so each callback keeps its
         # own slot's values instead of the last loop iteration's.
         d.addCallback(lambda _, largs=log_args: logger.info(
             logfmt % "Stored", largs, extra={'spider': spider}))
         d.addErrback(
             lambda f, largs=log_args: logger.error(
                 logfmt % "Error storing",
                 largs,
                 exc_info=failure_to_exc_info(f),
                 extra={'spider': spider}))
         deferred_list.append(d)
     return defer.DeferredList(deferred_list) if deferred_list else None
Example #43
    def _close_slot(self, slot, spider):
        if not slot.itemcount and not slot.store_empty:
            # We need to call slot.storage.store nonetheless to get the file
            # properly closed.
            return defer.maybeDeferred(slot.storage.store, slot.file)
        slot.finish_exporting()
        logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
        log_args = {
            'format': slot.format,
            'itemcount': slot.itemcount,
            'uri': slot.uri
        }
        d = defer.maybeDeferred(slot.storage.store, slot.file)

        # Use `largs=log_args` to copy log_args into the function's scope
        # instead of reading `log_args` from the outer scope
        d.addCallback(lambda _, largs=log_args: logger.info(
            logfmt % "Stored", largs, extra={'spider': spider}))
        d.addErrback(lambda f, largs=log_args: logger.error(
            logfmt % "Error storing",
            largs,
            exc_info=failure_to_exc_info(f),
            extra={'spider': spider}))
        return d
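The largs=log_args default argument above exists because Python closures capture variables, not values: in a loop such as Example #42's, every lambda would otherwise read the last iteration's log_args. A standalone illustration of the pitfall and the fix:

late = [lambda: i for i in range(3)]
print([f() for f in late])    # [2, 2, 2]: each lambda reads the final i

bound = [lambda i=i: i for i in range(3)]
print([f() for f in bound])   # [0, 1, 2]: the default argument freezes each value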
Example #44
File: engine.py Project: 01-/scrapy
 def errback(failure):
     # msg and spider are free variables from the enclosing scope;
     # this is a nested helper inside the engine
     logger.error(
         msg,
         exc_info=failure_to_exc_info(failure),
         extra={'spider': spider}
     )
Example #45
 def test_non_failure(self):
     self.assertIsNone(failure_to_exc_info("test"))