Code Example #1
File: media.py  Project: yanjunli0624/scrapy
    def _cache_result_and_execute_waiters(self, result, fp, info):
        if isinstance(result, Failure):
            # minimize cached information for failure
            result.cleanFailure()
            result.frames = []
            result.stack = None

            # This code fixes a memory leak by not keeping references to
            # the Request and Response objects in the Media Pipeline cache.
            #
            # Twisted inline callbacks pass return values using the function
            # twisted.internet.defer.returnValue, which encapsulates the
            # return value inside a _DefGen_Return exception (a BaseException
            # subclass).
            #
            # When the media_downloaded callback raises another exception,
            # for example a FileException('download-error') when the Response
            # status code is not 200 OK, the _DefGen_Return exception gets
            # stored in the FileException's __context__.
            #
            # To avoid keeping references to the Response, and therefore the
            # Request, in the Media Pipeline cache, we wipe the context of
            # the exception encapsulated by the Twisted Failure when it is a
            # _DefGen_Return instance.
            #
            # This problem does not occur in Python 2.7 since we don't have
            # Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
            context = getattr(result.value, '__context__', None)
            if isinstance(context, _DefGen_Return):
                setattr(result.value, '__context__', None)

        info.downloading.remove(fp)
        info.downloaded[fp] = result  # cache result
        for wad in info.waiting.pop(fp):
            defer_result(result).chainDeferred(wad)
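
The context wiping above relies on Python 3's implicit exception chaining (PEP 3134): an exception raised inside an except block keeps a reference to the exception being handled via its __context__ attribute. The following minimal sketch reproduces the leak mechanism the comment describes; the DownloadResult class and its payload are illustrative stand-ins, not Scrapy or Twisted names:

    class DownloadResult(BaseException):
        # stands in for Twisted's _DefGen_Return, which wraps the Response
        def __init__(self, payload):
            self.payload = payload

    class FileException(Exception):
        pass

    try:
        try:
            raise DownloadResult(payload='<big Response object>')
        except BaseException:
            # raising here implicitly chains the active exception
            raise FileException('download-error')
    except FileException as exc:
        assert isinstance(exc.__context__, DownloadResult)  # payload kept alive
        exc.__context__ = None  # wiping the context drops the last reference
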
Code Example #2
File: media.py  Project: 01-/scrapy
 def _cache_result_and_execute_waiters(self, result, fp, info):
     if isinstance(result, Failure):
         # minimize cached information for failure
         result.cleanFailure()
         result.frames = []
         result.stack = None
     info.downloading.remove(fp)
     info.downloaded[fp] = result  # cache result
     for wad in info.waiting.pop(fp):
         defer_result(result).chainDeferred(wad)
Code Example #3
File: media.py  Project: xacprod/ve1
 def _cache_result_and_execute_waiters(self, result, fp, info):
     if isinstance(result, Failure):
         # minimize cached information for failure
         result.cleanFailure()
         result.frames = []
         result.stack = None
     info.downloading.remove(fp)
     info.downloaded[fp] = result  # cache result
     for wad in info.waiting.pop(fp):
         defer_result(result).chainDeferred(wad)
Code Example #4
File: scraper.py  Project: zjkanjie/scrapy
 def call_spider(self, result, request, spider):
     result.request = request
     dfd = defer_result(result)
     dfd.addCallbacks(callback=request.callback or spider.parse,
                      errback=request.errback,
                      callbackKeywords=request.cb_kwargs)
     return dfd.addCallback(iterate_spider_output)
Code Example #5
File: media.py  Project: sbe710/web-crawler
    def _process_request(self, request, info):
        fp = request_fingerprint(request)
        cb = request.callback or (lambda _: _)
        eb = request.errback
        request.callback = None
        request.errback = None

        # Return cached result if request was already seen
        if fp in info.downloaded:
            return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

        # Otherwise, wait for result
        wad = Deferred().addCallbacks(cb, eb)
        info.waiting[fp].append(wad)

        # Check if request is downloading right now to avoid doing it twice
        if fp in info.downloading:
            return wad

        # Download request checking media_to_download hook output first
        info.downloading.add(fp)
        dfd = mustbe_deferred(self.media_to_download, request, info)
        dfd.addCallback(self._check_media_to_download, request, info)
        dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
        dfd.addErrback(lambda f: logger.error(
            f.value, exc_info=failure_to_exc_info(f), extra={'spider': info.spider})
        )
        return dfd.addBoth(lambda _: wad)  # it must return wad in the end
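
Every branch of _process_request funnels results through defer_result, which normalizes any value into a Deferred so that callbacks can be attached uniformly. As a rough approximation of what scrapy.utils.defer.defer_result does (the real helper fires the Deferred on a later reactor iteration rather than synchronously, so treat this as a sketch, not the verbatim implementation):

    from twisted.internet import defer
    from twisted.python import failure

    def defer_result(result):
        if isinstance(result, defer.Deferred):
            return result                 # pass Deferreds through untouched
        if isinstance(result, failure.Failure):
            return defer.fail(result)     # Failures become errbacked Deferreds
        return defer.succeed(result)      # plain values become fired Deferreds
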
Code Example #6
File: media.py  Project: zhangcheng/scrapy
    def _process_request(self, request, info):
        fp = request_fingerprint(request)
        cb = request.callback or (lambda _: _)
        eb = request.errback
        request.callback = None
        request.errback = None

        # Return cached result if request was already seen
        if fp in info.downloaded:
            return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

        # Otherwise, wait for result
        wad = Deferred().addCallbacks(cb, eb)
        info.waiting[fp].append(wad)

        # Check if request is downloading right now to avoid doing it twice
        if fp in info.downloading:
            return wad

        # Download request checking media_to_download hook output first
        info.downloading.add(fp)
        dfd = mustbe_deferred(self.media_to_download, request, info)
        dfd.addCallback(self._check_media_to_download, request, info)
        dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
        dfd.addErrback(log.err, spider=info.spider)
        return dfd.addBoth(lambda _: wad)  # it must return wad in the end
Code Example #7
    def call_spider(self, result, request, spider):
        ## Call back into the spider module

        result.request = request
        dfd = defer_result(result)
        ## Register callbacks; if no callback is defined, the spider
        ## module's parse method is called
        dfd.addCallbacks(request.callback or spider.parse, request.errback)
        return dfd.addCallback(iterate_spider_output)
Code Example #8
 def call_spider(self, result, request, spider):
     result.request = request
     dfd = defer_result(result)
     callback = request.callback or spider.parse
     warn_on_generator_with_return_value(spider, callback)
     warn_on_generator_with_return_value(spider, request.errback)
     dfd.addCallbacks(callback=callback,
                      errback=request.errback,
                      callbackKeywords=request.cb_kwargs)
     return dfd.addCallback(iterate_spider_output)
Code Example #9
    def _deferred_value(self, value, spider):
        labels = self.labels.get(value)
        if labels is not None:
            self.logger.debug("found labels in cache for %s: %s", value, labels)
            return defer_result(labels)

        request = Request(self.url.format(value), priority=1)
        deferred = spider.crawler.engine.download(request, spider)
        deferred.addBoth(self._extract_labels, value)
        return deferred
Code Example #10
 def call_spider(self, result, request, spider):
     result.request = request
     # Schedule handling of result (really the response) on the event loop;
     # the next loop iteration fires dfd's callback to process result
     dfd = defer_result(result)
     # The callback registered on the request is called first; if there is
     # none, parse is called, so parse is the default whenever the request
     # does not specify a callback
     dfd.addCallbacks(callback=request.callback or spider.parse,
                      errback=request.errback,
                      callbackKeywords=request.cb_kwargs)
     # Note: the spider's parse method is added as a callback here, and
     # calling parse usually returns a generator
     return dfd.addCallback(iterate_spider_output)
Code Example #11
 def call_spider(self, result, request, spider):
     result.request = request
     dfd = defer_result(result)
     if request.callback:
         logger.info('Called request.callback %s', request.callback)
     else:
         logger.info('Called spider.parse')
     dfd.addCallbacks(callback=request.callback or spider.parse,
                      errback=request.errback,
                      callbackKeywords=request.cb_kwargs)
     return dfd.addCallback(iterate_spider_output)
Code Example #12
 def _deferred_field(self, field, item, spider):
     deferreds = [
         self._deferred_value(value, spider)
         for value in arg_to_iter(item.get(field))
     ]
     if not deferreds:
         item[field] = None
         return defer_result(item)
     deferred = DeferredList(deferreds, consumeErrors=True)
     deferred.addBoth(self._add_value, field, item)
     return deferred
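
DeferredList(deferreds, consumeErrors=True) fires once every child Deferred has fired, delivering a list of (success, result) pairs; consumeErrors=True keeps individual failures from being logged as unhandled, since they are reported through that list instead. A small self-contained illustration (the report callback is ours, for demonstration only):

    from twisted.internet import defer

    d1 = defer.succeed('music')
    d2 = defer.fail(ValueError('lookup failed'))

    dl = defer.DeferredList([d1, d2], consumeErrors=True)

    def report(results):
        # results == [(True, 'music'), (False, <Failure ValueError>)]
        for success, value in results:
            print(success, value)

    dl.addBoth(report)
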
Code Example #13
File: media.py  Project: serkanh/scrapy
        def _post_media_to_download(result):
            if result is None:  # continue with the download
                dwld = mustbe_deferred(self.download, request, info)
                dwld.addCallbacks(
                    callback=self.media_downloaded,
                    callbackArgs=(request, info),
                    errback=self.media_failed,
                    errbackArgs=(request, info))
            else:  # or use the media_to_download return value as the result
                dwld = defer_result(result)

            info.downloading[fp] = (request, dwld)  # fill downloading state data
            dwld.addBoth(_downloaded)  # append post-download hook
            dwld.addErrback(log.err, spider=info.spider)
Code Example #14
File: media.py  Project: sbe710/web-crawler
    def _cache_result_and_execute_waiters(self, result, fp, info):
        if isinstance(result, Failure):
            # minimize cached information for failure
            result.cleanFailure()
            result.frames = []
            result.stack = None

            # This code fixes a memory leak by not keeping references to
            # the Request and Response objects in the Media Pipeline cache.
            #
            # What happens when the media_downloaded callback raises an
            # exception, for example a FileException('download-error') when
            # the Response status code is not 200 OK, is that the original
            # StopIteration exception (which in turn contains the failed
            # Response and by extension, the original Request) gets encapsulated
            # within the FileException context.
            #
            # Originally, Scrapy was using twisted.internet.defer.returnValue
            # inside functions decorated with twisted.internet.defer.inlineCallbacks,
            # encapsulating the returned Response in a _DefGen_Return exception
            # instead of a StopIteration.
            #
            # To avoid keeping references to the Response, and therefore the
            # Request, in the Media Pipeline cache, we should wipe the context
            # of the encapsulated exception when it is a StopIteration instance.
            #
            # This problem does not occur in Python 2.7 since we don't have
            # Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
            context = getattr(result.value, '__context__', None)
            if isinstance(context, StopIteration):
                setattr(result.value, '__context__', None)

        info.downloading.remove(fp)
        info.downloaded[fp] = result  # cache result
        for wad in info.waiting.pop(fp):
            defer_result(result).chainDeferred(wad)
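
The StopIteration variant exists because, in Python 3, a generator's return statement delivers its value as StopIteration(value); inside an inlineCallbacks body the returned Response therefore rides along on that exception and can end up chained into a later failure. A tiny Twisted-free demonstration of the wrapping (the generator is a stand-in, not Scrapy's media_downloaded):

    def media_downloaded():
        yield  # in Scrapy this would be an inlineCallbacks-decorated body
        return '<big Response object>'

    gen = media_downloaded()
    next(gen)                        # run to the yield
    try:
        next(gen)                    # finishing raises StopIteration(value)
    except StopIteration as exc:
        assert exc.value == '<big Response object>'  # the reference to wipe
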
Code Example #15
File: media.py  Project: serkanh/scrapy
    def _enqueue(self, request, info):
        wad = request.deferred or Deferred()
        fp = request_fingerprint(request)

        # if already downloaded, return cached result.
        if fp in info.downloaded:
            return defer_result(info.downloaded[fp]).chainDeferred(wad)

        # add to pending list for this request, and wait for result like the others.
        info.waiting.setdefault(fp, []).append(wad)

        # if request is not downloading, download it.
        if fp not in info.downloading:
            self._download(request, info, fp)

        return wad
Code Example #16
File: scraper.py  Project: CzaOrz/sourceCodeLearning
 def call_spider(self, result, request, spider):
     # result is actually the Response produced once the download completes
     result.request = request
     dfd = defer_result(result)
     # What is the point of this step? It does not seem to run anything yet.
     """
     Entry URLs all come from start_urls, which means the response the
     downloader produces corresponds to a URL; at this point no callback
     has been invoked yet.

     Could the callback or parse here refer to the previous level?
     For the start_requests function, the results we obtain are handled
     by the parse function.
     """
     # Found it: callback takes priority over parse; if none is given,
     # parse is the default
     dfd.addCallbacks(request.callback or spider.parse, request.errback)
     """ OK, I was overthinking it: this binds the callback for the
     previous level, regardless of what this level is; only if I return
     a Request does control come back here next time.
     The callback is added here as the follow-up processing step.
     For example, with five front-page entry points, the crawler first
     records all five entries' data and only then handles pagination;
     how is that implemented?
     """
     return dfd.addCallback(iterate_spider_output)
Code Example #17
File: media.py  Project: kenzouyeh/scrapy
    def _enqueue(self, request, info):
        fp = request_fingerprint(request)
        cb = request.callback or (lambda _: _)
        eb = request.errback

        # if already downloaded, return cached result.
        if fp in info.downloaded:
            return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

        wad = Deferred().addCallbacks(cb, eb)
        # add to pending list for this request, and wait for result like the others.
        info.waiting.setdefault(fp, []).append(wad)

        # if request is not downloading, download it.
        if fp not in info.downloading:
            self._download(request, info, fp)

        return wad
Code Example #18
File: scraper.py  Project: herberthamaral/scrapy
 def call_spider(self, result, request, spider):
     dfd = defer_result(result)
     dfd.addCallbacks(request.callback or spider.parse, request.errback)
     return dfd.addCallback(iterate_spider_output)
Code Example #19
File: spider.py  Project: zanachka/scrapyext
 def call_spider(self, result, request, spider):
     result.request = request
     dfd = defer_result(result)
     # patched: the default callback is spider.from_scraper, not spider.parse
     dfd.addCallbacks(request.callback or spider.from_scraper,
                      request.errback)
     return dfd.addCallback(iterate_spider_output)
Code Example #20
File: scraper.py  Project: serkanh/scrapy
 def call_spider(self, result, request, spider):
     defer_result(result).chainDeferred(request.deferred)
     return request.deferred.addCallback(iterate_spider_output)
Code Example #21
File: spider.py  Project: nyov/scrapyext
 def call_spider(self, result, request, spider):
     result.request = request
     dfd = defer_result(result)
     # patched: the default callback is spider.from_scraper, not spider.parse
     dfd.addCallbacks(request.callback or spider.from_scraper, request.errback)
     return dfd.addCallback(iterate_spider_output)
Code Example #22
 def call_spider(self, result, request, spider):
     result.request = request
     dfd = defer_result(result)
     dfd.addCallbacks(request.callback or spider.parse, request.errback)
     return dfd.addCallback(iterate_spider_output)
Code Example #23
File: media.py  Project: zhangcheng/scrapy
 def _cache_result_and_execute_waiters(self, result, fp, info):
     info.downloading.remove(fp)
     info.downloaded[fp] = result  # cache result
     for wad in info.waiting.pop(fp):
         defer_result(result).chainDeferred(wad)
Code Example #24
 def _cache_result_and_execute_waiters(self, result, fp, info):
     info.downloading.remove(fp)
     info.downloaded[fp] = result  # cache result
     for wad in info.waiting.pop(fp):
         defer_result(result).chainDeferred(wad)
Code Example #25
File: media.py  Project: serkanh/scrapy
 def _downloaded(result):
     info.downloading.pop(fp)
     info.downloaded[fp] = result
     for wad in info.waiting.pop(fp):  # pass result to each waiting client
         defer_result(result).chainDeferred(wad)
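
The recurring defer_result(result).chainDeferred(wad) idiom forwards one Deferred's outcome into another: Twisted documents a.chainDeferred(b) as equivalent to a.addCallbacks(b.callback, b.errback), so each waiting client's Deferred fires with the cached result. A minimal sketch (the waiter callback is illustrative):

    from twisted.internet import defer

    wad = defer.Deferred()  # a waiting client, like one entry in info.waiting[fp]
    wad.addCallback(lambda result: print('waiter got:', result))

    source = defer.succeed('cached result')  # what defer_result(...) returns
    source.chainDeferred(wad)                # fires wad with the same result
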