Example #1
0
 def _check_media_to_download(self, result, request, info):
     if result is not None:
         return result
     # Download request and process its response
     return mustbe_deferred(self.download, request, info).addCallbacks(
             callback=self.media_downloaded, callbackArgs=(request, info),
             errback=self.media_failed, errbackArgs=(request, info))
Example #2
0
    def _download(self, slot, request, spider):
        """Start downloading ``request`` and track it in ``slot.transferring``.

        The request is handed to ``self.handlers.download_request`` through
        ``mustbe_deferred``. On success a ``response_downloaded`` signal is
        sent; in every case (success or failure) the request is removed from
        ``slot.transferring`` and ``self._process_queue`` is called.

        Returns whatever ``dfd.addBoth(...)`` returns for the download
        deferred.
        """
        # The order is very important for the following deferreds. Do not change!

        # 1. Create the download deferred
        dfd = mustbe_deferred(self.handlers.download_request, request, spider)

        # 2. Notify response_downloaded listeners about the recent download
        # before querying queue for next request
        def _downloaded(response):
            self.signals.send_catch_log(signal=signals.response_downloaded,
                                        response=response,
                                        request=request,
                                        spider=spider)
            return response

        dfd.addCallback(_downloaded)

        # 3. Mark the request as transferring now. After the response (or a
        # failure) arrives, finish_transferring removes it again to free up
        # the transferring slot so it can be used by the following requests
        # (perhaps those which came from the downloader middleware itself)
        slot.transferring.add(request)

        def finish_transferring(_):
            slot.transferring.remove(request)
            self._process_queue(spider, slot)
            # Hand the incoming result/failure on unchanged.
            return _

        return dfd.addBoth(finish_transferring)
Example #3
0
    def _download(self, slot, request, spider):
        """Download ``request`` while tracking it in ``slot.transferring``.

        A ``response_downloaded`` signal is sent when a response arrives.
        Whether the download succeeds or fails, the request is removed from
        ``slot.transferring`` and ``self._process_queue`` is called.

        Returns the result of ``addBoth`` on the download deferred.
        """
        # NOTE: the callbacks below must be attached in exactly this order.

        # Step 1: kick off the download through the handlers.
        download_dfd = mustbe_deferred(
            self.handlers.download_request, request, spider
        )

        # Step 2: tell response_downloaded listeners about the response
        # before the queue is asked for the next request.
        def _announce(response):
            self.signals.send_catch_log(
                signal=signals.response_downloaded,
                response=response,
                request=request,
                spider=spider,
            )
            return response

        download_dfd.addCallback(_announce)

        # Step 3: flag the request as transferring; it is unflagged again once
        # a result arrives so the slot can serve the following requests
        # (possibly ones produced by the downloader middleware itself).
        slot.transferring.add(request)

        def _release_slot(outcome):
            slot.transferring.remove(request)
            self._process_queue(spider, slot)
            return outcome

        return download_dfd.addBoth(_release_slot)
Example #4
0
    def _process_request(self, request, info):
        """Resolve ``request`` from cache, join an in-flight download, or start one.

        The request's own callback/errback are detached and attached to the
        deferred handed back to the caller instead. Results are looked up by
        request fingerprint in ``info.downloaded``; concurrent requests for the
        same fingerprint share one download via ``info.waiting``.
        """
        fp = request_fingerprint(request)
        # Take the callbacks off the request (an identity function stands in
        # for a missing callback) so they fire on our own deferreds instead.
        cb = request.callback or (lambda _: _)
        eb = request.errback
        request.callback = None
        request.errback = None

        # Return cached result if request was already seen
        if fp in info.downloaded:
            return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

        # Otherwise, wait for result
        wad = Deferred().addCallbacks(cb, eb)
        info.waiting[fp].append(wad)

        # Check if request is downloading right now to avoid doing it twice
        if fp in info.downloading:
            return wad

        # Download request checking media_to_download hook output first
        info.downloading.add(fp)
        dfd = mustbe_deferred(self.media_to_download, request, info)
        dfd.addCallback(self._check_media_to_download, request, info)
        dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
        # Log any failure still present after the caching step.
        dfd.addErrback(lambda f: logger.error(
            f.value, exc_info=failure_to_exc_info(f), extra={'spider': info.spider})
        )
        return dfd.addBoth(lambda _: wad)  # it must return wad at last
Example #5
0
    def download(self, request, spider):
        """Fetch ``request`` through the downloader and post-process the outcome.

        A Response is tied to ``request`` and logged; a Request is scheduled.
        Failures are logged and replaced by an ``IgnoreRequest`` failure.
        ``self.next_request(spider)`` is triggered once the chain completes.
        If ``spider`` is not in ``self.downloader.sites`` an already-failed
        deferred carrying ``IgnoreRequest`` is returned instead.
        """
        def _handle_result(result):
            """Process whatever the downloader produced on success."""
            assert isinstance(result, (Response, Request))
            if isinstance(result, Response):
                # Link the response to the request that produced it.
                result.request = request
                crawled_line = log.formatter.crawled(request, result, spider)
                log.msg(crawled_line, level=log.DEBUG, spider=spider)
                return result
            if isinstance(result, Request):
                return mustbe_deferred(self.schedule, result, spider)

        def _handle_failure(failure):
            """Log a download failure and convert it to IgnoreRequest."""
            error = failure.value
            errmsg = (failure.getErrorMessage()
                      if isinstance(error, IgnoreRequest)
                      else str(failure))
            if errmsg:
                log.msg(
                    "Error downloading <%s>: %s" % (request.url, errmsg),
                    level=log.ERROR,
                    spider=spider,
                )
            return Failure(IgnoreRequest(str(error)))

        def _schedule_next(outcome):
            self.next_request(spider)
            return outcome

        if spider in self.downloader.sites:
            fetch_dfd = mustbe_deferred(self.downloader.fetch, request, spider)
            fetch_dfd.addCallbacks(_handle_result, _handle_failure)
            fetch_dfd.addBoth(_schedule_next)
            return fetch_dfd

        rejected = defer.fail(Failure(IgnoreRequest()))
        return rejected.addBoth(_schedule_next)
Example #6
0
    def _process_request(self, request, info):
        """Serve ``request`` from cache, join a running download, or start one.

        The request's callback/errback are detached and attached to the
        deferred given back to the caller. Lookups are keyed by the request
        fingerprint.
        """
        fingerprint = request_fingerprint(request)

        # Move the user's callbacks off the request; an identity function
        # stands in when no callback was set.
        on_success = request.callback or (lambda _: _)
        on_failure = request.errback
        request.callback = request.errback = None

        # Already downloaded: answer straight from the cache.
        if fingerprint in info.downloaded:
            cached = defer_result(info.downloaded[fingerprint])
            return cached.addCallbacks(on_success, on_failure)

        # Not cached: register a waiter for the eventual result.
        waiter = Deferred().addCallbacks(on_success, on_failure)
        info.waiting[fingerprint].append(waiter)

        # A download for this fingerprint is already running; just wait.
        if fingerprint in info.downloading:
            return waiter

        # Start the download, consulting the media_to_download hook first.
        info.downloading.add(fingerprint)
        chain = mustbe_deferred(self.media_to_download, request, info)
        chain.addCallback(self._check_media_to_download, request, info)
        chain.addBoth(self._cache_result_and_execute_waiters, fingerprint, info)
        chain.addErrback(log.err, spider=info.spider)
        # The waiter must be the final result of the chain.
        return chain.addBoth(lambda _: waiter)
Example #7
0
    def _download(self, slot, request, spider):
        """Start downloading ``request`` and track it in ``slot.transferring``.

        Sends ``response_downloaded`` on success; on success or failure the
        request is removed from ``slot.transferring`` and
        ``self._process_queue`` is called.
        """
        # The order is very important for the following deferreds. Do not
        # change!

        # 1. Create the download deferred: call the handlers' download_request
        # to start the actual download.
        # See scrapy/core/downloader/handlers/__init__.py for download_request.
        # NOTE(review): the original note says a download queue is maintained
        # and downloads are delayed according to configuration - not visible
        # in this block, confirm against the caller.
        dfd = mustbe_deferred(self.handlers.download_request, request, spider)

        # 2. Notify response_downloaded listeners about the recent download
        # before querying queue for next request
        # Callback to register:
        def _downloaded(response):
            self.signals.send_catch_log(signal=signals.response_downloaded,
                                        response=response,
                                        request=request,
                                        spider=spider)
            return response

        dfd.addCallback(_downloaded)

        # 3. After response arrives,  remove the request from transferring
        # state to free up the transferring slot so it can be used by the
        # following requests (perhaps those which came from the downloader
        # middleware itself)
        slot.transferring.add(request)  # added here; removed again once a result comes back

        def finish_transferring(_):
            # Callback that runs when the response (or a failure) comes back
            slot.transferring.remove(request)
            # Called once the download has finished
            self._process_queue(spider, slot)
            return _

        return dfd.addBoth(finish_transferring)
Example #8
0
    def download(self, download_func, request,
                 spider):  # per the original note, download_func is actually the downloader's _enqueue_request
        """Run ``request`` through the middleware chain around ``download_func``.

        ``process_request`` methods run first; the first truthy result
        short-circuits the download, otherwise ``download_func`` is called.
        Failures go through the ``process_exception`` methods and the
        resulting value through the ``process_response`` methods.

        Returns the deferred built from ``mustbe_deferred(process_request, request)``.
        Raises (inside the deferred) ``_InvalidOutput`` when a middleware
        returns an unsupported type, and ``TypeError`` when
        ``process_response`` receives None.
        """
        @defer.inlineCallbacks
        def process_request(request):
            for method in self.methods['process_request']:
                response = yield deferred_from_coro(
                    method(request=request, spider=spider)
                )  # deferred_from_coro adapts the middleware result (possibly an asyncio coroutine) for the reactor before it is yielded
                if response is not None and not isinstance(
                        response, (Response, Request)):
                    raise _InvalidOutput(
                        f"Middleware {method.__self__.__class__.__name__}"
                        ".process_request must return None, Response or "
                        f"Request, got {response.__class__.__name__}")
                if response:
                    return response
            # No middleware produced a result: perform the download itself.
            return (yield download_func(request=request, spider=spider))

        @defer.inlineCallbacks
        def process_response(response):
            if response is None:
                raise TypeError("Received None in process_response")
            elif isinstance(response, Request):
                # A Request is handed back as-is, skipping the response methods.
                return response

            for method in self.methods['process_response']:
                response = yield deferred_from_coro(
                    method(request=request, response=response, spider=spider))
                if not isinstance(response, (Response, Request)):
                    raise _InvalidOutput(
                        f"Middleware {method.__self__.__class__.__name__}"
                        ".process_response must return Response or Request, "
                        f"got {type(response)}")
                if isinstance(response, Request):
                    return response
            return response

        @defer.inlineCallbacks
        def process_exception(failure):
            exception = failure.value
            for method in self.methods['process_exception']:
                response = yield deferred_from_coro(
                    method(request=request, exception=exception,
                           spider=spider))
                if response is not None and not isinstance(
                        response, (Response, Request)):
                    raise _InvalidOutput(
                        f"Middleware {method.__self__.__class__.__name__}"
                        ".process_exception must return None, Response or "
                        f"Request, got {type(response)}")
                if response:
                    return response
            # No middleware handled the exception: keep the original failure.
            return failure

        deferred = mustbe_deferred(
            process_request,
            request)  # original note: similar to maybeDeferred, but callbacks and errbacks are not run within the current turn
        deferred.addErrback(process_exception)
        deferred.addCallback(process_response)
        return deferred
Example #9
0
    def download(self, request, spider):
        """Download ``request`` and normalise the outcome.

        Responses are linked to ``request`` and logged at DEBUG level,
        Requests are scheduled, and failures are logged and turned into an
        ``IgnoreRequest`` failure. ``self.next_request(spider)`` runs when the
        chain finishes. A spider missing from ``self.downloader.sites`` gets
        an already-failed deferred with ``IgnoreRequest``.
        """
        def _success(outcome):
            """Handle the value produced by a finished page download."""
            assert isinstance(outcome, (Response, Request))
            if isinstance(outcome, Response):
                outcome.request = request  # remember the originating request
                log.msg(
                    log.formatter.crawled(request, outcome, spider),
                    level=log.DEBUG,
                    spider=spider,
                )
                return outcome
            if isinstance(outcome, Request):
                return mustbe_deferred(self.schedule, outcome, spider)

        def _error(failure):
            """Handle an error raised while processing a page."""
            cause = failure.value
            if isinstance(cause, IgnoreRequest):
                text = failure.getErrorMessage()
            else:
                text = str(failure)
            if text:
                message = "Error downloading <%s>: %s" % (request.url, text)
                log.msg(message, level=log.ERROR, spider=spider)
            return Failure(IgnoreRequest(str(cause)))

        def _complete(passthrough):
            self.next_request(spider)
            return passthrough

        known_spider = spider in self.downloader.sites
        if not known_spider:
            return defer.fail(Failure(IgnoreRequest())).addBoth(_complete)

        pending = mustbe_deferred(self.downloader.fetch, request, spider)
        pending.addCallbacks(_success, _error)
        pending.addBoth(_complete)
        return pending
Example #10
0
    def _crawl(self, request):
        """Download ``request`` and summarise the outcome as a plain dict.

        The dict always has ``status``, ``doc``, ``headers``, ``url`` and
        ``meta``. Status is the response status for a Response, 700 for a
        first redirect (with ``redirect_time``), 601 once the redirect count
        reaches 2, and stays 600 (with ``error_message``) when the download
        did not yield a Response.
        """
        def _summarize(outcome, request):
            result = {"status": 600, "doc": None, "headers": None}

            # Anything that is not a Response is reported as a crawl failure.
            if not isinstance(outcome, Response):
                result["url"] = request.url
                result["error_message"] = "crawler failed:%s" % outcome
                result["meta"] = request.meta
                return result

            processed = self._redirect_middleware.process_response(
                request, outcome, None)
            if isinstance(processed, Response):
                result.update(
                    url=processed.url,
                    status=processed.status,
                    doc=processed.body,
                    headers=processed.headers,
                    meta=request.meta,
                )
            elif isinstance(processed, Request):
                # The redirect middleware asked for another request.
                hops = processed.meta.get("redirect_time", 0) + 1
                too_many = hops >= 2
                result.update(
                    url=processed.url,
                    status=601 if too_many else 700,
                    meta=processed.meta,
                )
                if not too_many:
                    result["redirect_time"] = hops
            else:
                raise Exception("not supported")
            return result

        download_dfd = mustbe_deferred(
            self._handler.download_request, request, None)
        return download_dfd.addBoth(_summarize, request)
Example #11
0
    def _download(self, request, info, fp):
        """Run the ``media_to_download`` hook, then download if it returned None.

        While in progress ``info.downloading[fp]`` holds ``None`` and later
        ``(request, deferred)``. When finished the result is stored in
        ``info.downloaded[fp]`` and passed to every deferred in
        ``info.waiting[fp]``. Nothing is returned.
        """
        def _store_and_notify(result):
            info.downloading.pop(fp)
            info.downloaded[fp] = result
            # Give the result to each client waiting on this fingerprint.
            waiters = info.waiting.pop(fp)
            for waiter in waiters:
                defer_result(result).chainDeferred(waiter)

        def _after_hook(result):
            if result is not None:
                # The hook supplied the result itself.
                dwld = defer_result(result)
            else:
                # The hook declined: go ahead with the download.
                hook_args = (request, info)
                dwld = mustbe_deferred(self.download, request, info)
                dwld.addCallbacks(
                    callback=self.media_downloaded,
                    callbackArgs=hook_args,
                    errback=self.media_failed,
                    errbackArgs=hook_args,
                )

            # Record the downloading state, then attach the post-download
            # hook and an error logger.
            info.downloading[fp] = (request, dwld)
            dwld.addBoth(_store_and_notify)
            dwld.addErrback(log.err, spider=info.spider)

        # Mark the request as downloading; None is a placeholder for now.
        info.downloading[fp] = None

        # Defer the pre-download hook.
        hook_dfd = mustbe_deferred(self.media_to_download, request, info)
        hook_dfd.addCallback(_after_hook)
Example #12
0
    def download(self, download_func: Callable, request: Request,
                 spider: Spider):
        """Run ``request`` through the middleware chain around ``download_func``.

        ``process_request`` methods run first and the first truthy result
        skips the download. Failures are offered to the ``process_exception``
        methods and the resulting value to the ``process_response`` methods.
        ``_InvalidOutput`` is raised for unsupported middleware return types
        and ``TypeError`` when ``process_response`` receives None.
        """
        @defer.inlineCallbacks
        def process_request(request: Request):
            for method in self.methods['process_request']:
                method = cast(Callable, method)
                response = yield deferred_from_coro(
                    method(request=request, spider=spider))
                acceptable = response is None or isinstance(
                    response, (Response, Request))
                if not acceptable:
                    raise _InvalidOutput(
                        f"Middleware {method.__qualname__} must return None, Response or "
                        f"Request, got {response.__class__.__name__}")
                if response:
                    return response
            # No middleware answered: perform the actual download.
            downloaded = yield download_func(request=request, spider=spider)
            return downloaded

        @defer.inlineCallbacks
        def process_response(response: Union[Response, Request]):
            if response is None:
                raise TypeError("Received None in process_response")
            if isinstance(response, Request):
                return response

            for method in self.methods['process_response']:
                method = cast(Callable, method)
                response = yield deferred_from_coro(
                    method(request=request, response=response, spider=spider))
                if not isinstance(response, (Response, Request)):
                    raise _InvalidOutput(
                        f"Middleware {method.__qualname__} must return Response or Request, "
                        f"got {type(response)}")
                if isinstance(response, Request):
                    # A new Request ends response processing immediately.
                    break
            return response

        @defer.inlineCallbacks
        def process_exception(failure: Failure):
            exception = failure.value
            for method in self.methods['process_exception']:
                method = cast(Callable, method)
                response = yield deferred_from_coro(
                    method(request=request, exception=exception,
                           spider=spider))
                acceptable = response is None or isinstance(
                    response, (Response, Request))
                if not acceptable:
                    raise _InvalidOutput(
                        f"Middleware {method.__qualname__} must return None, Response or "
                        f"Request, got {type(response)}")
                if response:
                    return response
            # Nobody handled it: keep the original failure.
            return failure

        chain = mustbe_deferred(process_request, request)
        chain.addErrback(process_exception)
        chain.addCallback(process_response)
        return chain
Example #13
0
 def _next_request(self, spider):
     """Take the next pending request from the scheduler and download it.

     The download deferred is chained into the deferred supplied by the
     scheduler, and any error left over is logged. Returns the download
     deferred, or None when the scheduler had no request.
     """
     request, deferred = self.scheduler.next_request(spider)
     if not request:
         return

     download_dfd = mustbe_deferred(self.download, request, spider)
     chained = download_dfd.chainDeferred(deferred)
     chained.addBoth(lambda _: deferred)
     download_dfd.addErrback(
         log.err,
         "Unhandled error on engine._next_request()",
         spider=spider,
     )
     return download_dfd
Example #14
0
 def _check_media_to_download(self, result, request, info):
     if result is not None:
         return result
     # Download request and process its response
     return mustbe_deferred(self.download, request, info).addCallbacks(
         callback=self.media_downloaded,
         callbackArgs=(request, info),
         errback=self.media_failed,
         errbackArgs=(request, info))
Example #15
0
 def next_stage(item, stages_left):
     """Feed ``item`` to the first remaining stage, then recurse on the rest.

     ``stages_left`` is consumed in place. When it is empty the item itself
     is returned; otherwise a deferred for the current stage is returned,
     with this function attached as callback for the remaining stages.
     """
     assert isinstance(item, BaseItem), (
         'Item pipelines must return a BaseItem, got %s' % type(item).__name__)
     if stages_left:
         stage = stages_left.pop(0)
         stage_dfd = mustbe_deferred(stage.process_item, spider, item)
         stage_dfd.addCallback(next_stage, stages_left)
         return stage_dfd
     return item
    def download(self, download_func, request, spider):
        """Run ``request`` through the middleware chain around ``download_func``.

        ``process_request`` methods run first; the first truthy result skips
        the download. Failures are offered to the ``process_exception``
        methods and the resulting value to the ``process_response`` methods.
        Returns the deferred built from ``mustbe_deferred(process_request, request)``.
        """
        @defer.inlineCallbacks
        def process_request(request):
            for method in self.methods[
                    'process_request']:  # run each downloader middleware's process_request in turn
                response = yield method(
                    request=request, spider=spider
                )  # this is where request and spider are passed to process_request
                if response is not None and not isinstance(
                        response, (Response, Request)):
                    raise _InvalidOutput('Middleware %s.process_request must return None, Response or Request, got %s' % \
                                         (six.get_method_self(method).__class__.__name__, response.__class__.__name__))
                if response:
                    defer.returnValue(response)
            defer.returnValue(
                (yield download_func(request=request, spider=spider)
                 ))  # no middleware returned a value, so call the function passed in (per the original note, Downloader._enqueue_request)

        @defer.inlineCallbacks
        def process_response(response):
            assert response is not None, 'Received None in process_response'
            if isinstance(response, Request):
                # A Request is returned as-is, skipping the response methods.
                defer.returnValue(response)

            for method in self.methods['process_response']:
                response = yield method(request=request,
                                        response=response,
                                        spider=spider)
                if not isinstance(response, (Response, Request)):
                    raise _InvalidOutput('Middleware %s.process_response must return Response or Request, got %s' % \
                                         (six.get_method_self(method).__class__.__name__, type(response)))
                if isinstance(response, Request):
                    defer.returnValue(response)
            defer.returnValue(response)

        @defer.inlineCallbacks
        def process_exception(_failure):
            exception = _failure.value
            for method in self.methods['process_exception']:
                response = yield method(request=request,
                                        exception=exception,
                                        spider=spider)
                if response is not None and not isinstance(
                        response, (Response, Request)):
                    raise _InvalidOutput('Middleware %s.process_exception must return None, Response or Request, got %s' % \
                                         (six.get_method_self(method).__class__.__name__, type(response)))
                if response:
                    defer.returnValue(response)
            # No middleware handled the exception: keep the original failure.
            defer.returnValue(_failure)

        deferred = mustbe_deferred(
            process_request, request
        )  # original note: process_request is invoked right away here, running every middleware's process_request and then sending the request off for download
        deferred.addErrback(process_exception)
        deferred.addCallback(process_response)
        return deferred
Example #17
0
 def _on_success(response):
     """Post-process the value produced by a successful page download.

     A Response is linked to the originating request, logged at DEBUG level
     and returned; a Request is scheduled through ``mustbe_deferred``.
     """
     assert isinstance(response, (Response, Request))
     if isinstance(response, Response):
         # Remember which request produced this response.
         response.request = request
         log.msg(
             log.formatter.crawled(request, response, spider),
             level=log.DEBUG,
             spider=spider,
         )
         return response
     if isinstance(response, Request):
         return mustbe_deferred(self.schedule, response, spider)
Example #18
0
 def _next_request(self, spider):
     """Download the scheduler's next pending request, if there is one.

     The download deferred is chained into the scheduler-provided deferred
     and unhandled errors are logged. Returns the download deferred, or None
     when nothing was pending.
     """
     pending = self.scheduler.next_request(spider)
     request, deferred = pending
     if request:
         dwld = mustbe_deferred(self.download, request, spider)
         linked = dwld.chainDeferred(deferred)
         linked.addBoth(lambda _: deferred)
         error_note = "Unhandled error on engine._next_request()"
         dwld.addErrback(log.err, error_note, spider=spider)
         return dwld
Example #19
0
    def test_success_function(self):
        """The callback must see the list after the later append of 2."""
        recorded = []

        def _record(value):
            recorded.append(value)
            return recorded

        pending = mustbe_deferred(_record, 1)
        # With maybeDeferred the callback would see [1] instead.
        pending.addCallback(self.assertEqual, [1, 2])
        # Appended after the callback is attached; assertEqual should see it.
        recorded.append(2)
        return pending
Example #20
0
    def test_success_function(self):
        """Check that the assertEqual callback observes ``[1, 2]``."""
        seen = []

        def _push(item):
            seen.append(item)
            return seen

        expected = [1, 2]  # it would be [1] with maybeDeferred
        result_dfd = mustbe_deferred(_push, 1)
        result_dfd.addCallback(self.assertEqual, expected)
        # One more value, added before returning, that the callback must see.
        seen.append(2)
        return result_dfd
Example #21
0
 def _on_success(response):
     """Handle what a finished page download produced.

     Responses get their ``request`` attribute set, are logged and returned;
     Requests are passed to ``self.schedule`` via ``mustbe_deferred``.
     """
     assert isinstance(response, (Response, Request))
     if isinstance(response, Response):
         response.request = request  # link response to its request
         crawled_line = log.formatter.crawled(request, response, spider)
         log.msg(crawled_line, level=log.DEBUG, spider=spider)
         return response
     if isinstance(response, Request):
         scheduled = mustbe_deferred(self.schedule, response, spider)
         return scheduled
Example #22
0
    def download(self, download_func, request, spider):
        """Run ``request`` through the middleware chain around ``download_func``.

        ``process_request`` methods run first and the first truthy result
        skips the download. Failures are offered to the ``process_exception``
        methods and the resulting value to the ``process_response`` methods.
        ``_InvalidOutput`` is raised for unsupported middleware return types.
        """
        @defer.inlineCallbacks
        def process_request(request):
            for mw_method in self.methods['process_request']:
                result = yield deferred_from_coro(
                    mw_method(request=request, spider=spider))
                if not (result is None
                        or isinstance(result, (Response, Request))):
                    raise _InvalidOutput(
                        "Middleware %s.process_request must return None, Response or Request, got %s"
                        % (mw_method.__self__.__class__.__name__,
                           result.__class__.__name__))
                if result:
                    defer.returnValue(result)
            # No middleware answered: perform the actual download.
            downloaded = yield download_func(request=request, spider=spider)
            defer.returnValue(downloaded)

        @defer.inlineCallbacks
        def process_response(response):
            assert response is not None, 'Received None in process_response'
            if not isinstance(response, Request):
                for mw_method in self.methods['process_response']:
                    response = yield deferred_from_coro(
                        mw_method(request=request, response=response,
                                  spider=spider))
                    if not isinstance(response, (Response, Request)):
                        raise _InvalidOutput(
                            "Middleware %s.process_response must return Response or Request, got %s"
                            % (mw_method.__self__.__class__.__name__,
                               type(response)))
                    if isinstance(response, Request):
                        # A new Request ends response processing immediately.
                        break
            defer.returnValue(response)

        @defer.inlineCallbacks
        def process_exception(_failure):
            error = _failure.value
            for mw_method in self.methods['process_exception']:
                result = yield deferred_from_coro(
                    mw_method(request=request, exception=error,
                              spider=spider))
                if not (result is None
                        or isinstance(result, (Response, Request))):
                    raise _InvalidOutput(
                        "Middleware %s.process_exception must return None, Response or Request, got %s"
                        % (mw_method.__self__.__class__.__name__,
                           type(result)))
                if result:
                    defer.returnValue(result)
            # Nobody handled it: keep the original failure.
            defer.returnValue(_failure)

        chain = mustbe_deferred(process_request, request)
        chain.addErrback(process_exception)
        chain.addCallback(process_response)
        return chain
Example #23
0
    def scrape_response(self, scrape_func, response, request, spider):
        """Run the spider-input step and route its outcome.

        ``self._process_spider_input`` is invoked through ``mustbe_deferred``.
        A successful result goes to ``self._process_callback_output`` and a
        failure to ``self._process_spider_exception``; both also receive
        ``response`` and ``spider``.
        """
        on_output = lambda result: self._process_callback_output(
            response, spider, result)
        on_exception = lambda _failure: self._process_spider_exception(
            response, spider, _failure)

        scrape_dfd = mustbe_deferred(
            self._process_spider_input, scrape_func, response, request, spider)
        scrape_dfd.addCallbacks(callback=on_output, errback=on_exception)
        return scrape_dfd
Example #24
0
    def test_unfired_deferred(self):
        """A function returning a not-yet-fired deferred still yields [1, 2]."""
        recorded = []

        def _record(value):
            recorded.append(value)
            later = defer.Deferred()
            # Fire from the reactor instead of synchronously.
            reactor.callLater(0, later.callback, recorded)
            return later

        pending = mustbe_deferred(_record, 1)
        # With maybeDeferred the callback would see [1] instead.
        pending.addCallback(self.assertEqual, [1, 2])
        # Appended after the callback is attached; assertEqual should see it.
        recorded.append(2)
        return pending
Example #25
0
    def test_unfired_deferred(self):
        """Check the callback sees ``[1, 2]`` for an unfired inner deferred."""
        seen = []

        def _push(item):
            seen.append(item)
            inner = defer.Deferred()
            reactor.callLater(0, inner.callback, seen)
            return inner

        expected = [1, 2]  # it would be [1] with maybeDeferred
        result_dfd = mustbe_deferred(_push, 1)
        result_dfd.addCallback(self.assertEqual, expected)
        # One more value, added before returning, that the callback must see.
        seen.append(2)
        return result_dfd
Example #26
0
    def download(self, download_func, request, spider):
        """Run ``request`` through the middleware chain around ``download_func``.

        ``process_request`` methods run first; the first truthy result skips
        the download. Otherwise ``download_func`` is called. If it fails, the
        request URL is appended to the ``eroor`` file and the error is
        re-raised so that the ``process_exception`` methods (and ultimately
        the caller) see the real failure instead of a later, unrelated
        ``AssertionError`` about a None response.

        Returns the deferred built from ``mustbe_deferred(process_request, request)``.
        """
        @defer.inlineCallbacks
        def process_request(request):
            for method in self.methods['process_request']:
                response = yield method(request=request, spider=spider)
                assert response is None or isinstance(response, (Response, Request)), \
                        'Middleware %s.process_request must return None, Response or Request, got %s' % \
                        (six.get_method_self(method).__class__.__name__, response.__class__.__name__)
                if response:
                    defer.returnValue(response)
            try:
                downloaded = yield download_func(request=request,
                                                 spider=spider)
            except Exception:
                # Keep a record of the failing URL, then let the error
                # propagate; swallowing it would make this generator finish
                # with None and hide the failure from process_exception.
                with open('eroor', 'a') as w:
                    w.write(request.url + '\n')
                raise
            defer.returnValue(downloaded)

        @defer.inlineCallbacks
        def process_response(response):
            assert response is not None, 'Received None in process_response'
            if isinstance(response, Request):
                # A Request is returned as-is, skipping the response methods.
                defer.returnValue(response)

            for method in self.methods['process_response']:
                response = yield method(request=request,
                                        response=response,
                                        spider=spider)
                assert isinstance(response, (Response, Request)), \
                    'Middleware %s.process_response must return Response or Request, got %s' % \
                    (six.get_method_self(method).__class__.__name__, type(response))
                if isinstance(response, Request):
                    defer.returnValue(response)
            defer.returnValue(response)

        @defer.inlineCallbacks
        def process_exception(_failure):
            exception = _failure.value
            for method in self.methods['process_exception']:
                response = yield method(request=request,
                                        exception=exception,
                                        spider=spider)
                assert response is None or isinstance(response, (Response, Request)), \
                    'Middleware %s.process_exception must return None, Response or Request, got %s' % \
                    (six.get_method_self(method).__class__.__name__, type(response))
                if response:
                    defer.returnValue(response)
            # No middleware handled the exception: keep the original failure.
            defer.returnValue(_failure)

        deferred = mustbe_deferred(process_request, request)
        deferred.addErrback(process_exception)
        deferred.addCallback(process_response)
        return deferred
Example #27
0
 def crawl(self, request, spider):
     """Schedule ``request`` for ``spider`` and hand the outcome to the scraper.

     The spider must be in ``self.open_spiders``. Requests for spiders in
     ``self.closing`` are dropped silently. Otherwise the request is
     scheduled, the result is passed to ``self.scraper.enqueue_scrape``,
     leftover errors are logged and ``self.next_request(spider)`` is called.
     """
     assert spider in self.open_spiders, (
         "Spider %r not opened when crawling: %s" % (spider.name, request))
     if spider not in self.closing:
         scheduled = mustbe_deferred(self.schedule, request, spider)
         # FIXME: we can't log errors because we would be preventing them from
         # propagating to the request errback. This should be fixed after the
         # next core refactoring.
         #scheduled.addErrback(log.err, "Error on engine.crawl()")
         scheduled.addBoth(self.scraper.enqueue_scrape, request, spider)
         scheduled.addErrback(
             log.err, "Unhandled error on engine.crawl()", spider=spider)
         scheduled.addBoth(lambda _: self.next_request(spider))
     # Spiders being closed fall through here: their requests are ignored.
Example #28
0
 def _on_success(response):
     """Handle the result of a page download.

     A ``Response`` gets the originating request attached, is logged at
     DEBUG level and is returned.  A ``Request`` is scheduled through
     ``mustbe_deferred``; the scheduling deferred is chained into the
     new request's own ``deferred``, which is returned.
     """
     assert isinstance(response, (Response, Request))
     if isinstance(response, Response):
         # Tie the request to the response that was received for it.
         response.request = request
         logline = self._crawled_logline(request, response)
         log.msg(logline, level=log.DEBUG, spider=spider)
         return response
     if isinstance(response, Request):
         scheduled = mustbe_deferred(self.schedule, response, spider)
         scheduled.chainDeferred(response.deferred)
         return response.deferred
Example #29
0
 def crawl(self, request, spider):
     """Schedule ``request`` for ``spider`` and pass the outcome on.

     A request whose ``deferred`` has no callbacks is reported at ERROR
     level and dropped (``None`` is returned).  Otherwise
     ``self.schedule`` is invoked through ``mustbe_deferred``, its result
     goes to ``self.scraper.enqueue_scrape``, any remaining error is
     logged, and finally ``self.next_request(spider)`` is called.
     """
     if not request.deferred.callbacks:
         message = "Unable to crawl Request with no callback: %s" % request
         log.msg(message, level=log.ERROR, spider=spider)
         return None
     scheduled = mustbe_deferred(self.schedule, request, spider)
     # FIXME: errors cannot be logged at this point, because that would
     # prevent them from propagating to the request errback.  This should
     # be fixed after the next core refactoring.
     scheduled.addBoth(self.scraper.enqueue_scrape, request, spider)
     scheduled.addErrback(
         log.err, "Unhandled error on engine.crawl()", spider=spider)
     # Whatever the outcome, ask for the next request.
     scheduled.addBoth(lambda _ignored: self.next_request(spider))
Example #30
0
    def enqueue_request(self, wrappedfunc, spider, request):
        """Run the ``enqueue_request`` middleware chain, then ``wrappedfunc``.

        Every function in ``self.methods['enqueue_request']`` is called with
        ``spider`` and ``request`` and must return ``None`` or a
        ``Deferred``.  The first truthy result short-circuits the chain;
        if none is produced, ``wrappedfunc(spider=spider, request=request)``
        is called.

        Returns the value of ``mustbe_deferred`` applied to that chain.
        """
        def _enqueue_request(request):
            for mwfunc in self.methods['enqueue_request']:
                result = mwfunc(spider=spider, request=request)
                # ``__self__`` exists on bound methods in Python 2.6+ and in
                # Python 3.  The former ``im_self`` is Python 2 only, so on
                # Python 3 building this message raised AttributeError
                # instead of naming the offending middleware.
                assert result is None or isinstance(result, Deferred), \
                        'Middleware %s.enqueue_request must return None or Deferred, got %s' % \
                        (mwfunc.__self__.__class__.__name__, result.__class__.__name__)
                if result:
                    return result
            return wrappedfunc(spider=spider, request=request)

        return mustbe_deferred(_enqueue_request, request)
Example #31
0
    def enqueue_request(self, wrappedfunc, spider, request):
        """Run the ``enqueue_request`` middleware chain, then ``wrappedfunc``.

        Every function in ``self.methods['enqueue_request']`` is called with
        ``spider`` and ``request`` and must return ``None`` or a
        ``Deferred``.  The first truthy result short-circuits the chain;
        if none is produced, ``wrappedfunc(spider=spider, request=request)``
        is called.

        Returns the value of ``mustbe_deferred`` applied to that chain.
        """
        def _enqueue_request(request):
            for mwfunc in self.methods['enqueue_request']:
                result = mwfunc(spider=spider, request=request)
                # ``__self__`` exists on bound methods in Python 2.6+ and in
                # Python 3.  The former ``im_self`` is Python 2 only, so on
                # Python 3 building this message raised AttributeError
                # instead of naming the offending middleware.
                assert result is None or isinstance(result, Deferred), \
                        'Middleware %s.enqueue_request must return None or Deferred, got %s' % \
                        (mwfunc.__self__.__class__.__name__, result.__class__.__name__)
                if result:
                    return result
            return wrappedfunc(spider=spider, request=request)

        return mustbe_deferred(_enqueue_request, request)
Example #32
0
    def scrape_response(self, scrape_func, response, request, spider):
        """Run ``response`` through the spider middleware chain.

        The ``process_spider_input`` methods run first, followed by
        ``scrape_func``.  ``process_spider_exception`` is registered as the
        errback and ``process_spider_output`` as the callback that follows
        it.

        Returns the deferred built by ``mustbe_deferred`` with those
        callbacks attached.
        """
        fname = lambda f: '%s.%s' % (six.get_method_self(f).__class__.__name__,
                                     six.get_method_function(f).__name__)

        def process_spider_input(response):
            # Run every middleware's process_spider_input method, then
            # scrape_func (the original note identifies it as call_spider).
            for method in self.methods['process_spider_input']:
                try:
                    result = method(response=response, spider=spider)
                    assert result is None, \
                            'Middleware %s must returns None or ' \
                            'raise an exception, got %s ' \
                            % (fname(method), type(result))
                except Exception:
                    # Only ordinary errors become a Failure.  A bare
                    # ``except:`` would also trap KeyboardInterrupt,
                    # SystemExit and GeneratorExit.
                    return scrape_func(Failure(), request, spider)
            return scrape_func(response, request, spider)

        def process_spider_exception(_failure):
            # Run the middlewares' process_spider_exception methods; the
            # first non-None result replaces the failure.
            exception = _failure.value
            for method in self.methods['process_spider_exception']:
                result = method(response=response,
                                exception=exception,
                                spider=spider)
                assert result is None or _isiterable(result), \
                    'Middleware %s must returns None, or an iterable object, got %s ' % \
                    (fname(method), type(result))
                if result is not None:
                    return result
            return _failure

        def process_spider_output(result):
            # Feed the result through each process_spider_output method in
            # turn; each one receives the previous one's output.
            for method in self.methods['process_spider_output']:
                result = method(response=response,
                                result=result,
                                spider=spider)
                assert _isiterable(result), \
                    'Middleware %s must returns an iterable object, got %s ' % \
                    (fname(method), type(result))
            return result

        # Run the process_spider_input step.
        dfd = mustbe_deferred(process_spider_input, response)
        # Register the exception handler.
        dfd.addErrback(process_spider_exception)
        # Register the output handler.
        dfd.addCallback(process_spider_output)
        return dfd
Example #33
0
        def _post_media_to_download(result):
            """Continue after ``media_to_download`` has produced ``result``.

            With a non-None ``result`` that value is wrapped with
            ``defer_result``; otherwise ``self.download`` is started and
            ``media_downloaded`` / ``media_failed`` are attached.  Either
            way the deferred is recorded in ``info.downloading[fp]`` and
            gets the ``_downloaded`` hook plus an error logger.
            """
            if result is not None:
                # Use the media_to_download return value as the result.
                pending = defer_result(result)
            else:
                # Nothing was returned: continue with the download.
                pending = mustbe_deferred(self.download, request, info)
                hook_args = (request, info)
                pending.addCallbacks(
                    callback=self.media_downloaded, callbackArgs=hook_args,
                    errback=self.media_failed, errbackArgs=hook_args)

            # Fill the downloading state data.
            info.downloading[fp] = (request, pending)
            # Append the post-download hook, then log whatever still fails.
            pending.addBoth(_downloaded)
            pending.addErrback(log.err, spider=info.spider)
Example #34
0
    def scrape_response(self, scrape_func, response, request, spider):
        """Run ``response`` through the spider middleware chain.

        Steps, in order:

        1. run the ``process_spider_input`` methods,
        2. run ``scrape_func`` (the original notes call it call_spider),
        3. run the ``process_spider_output`` methods.

        ``process_spider_exception`` is registered as the errback between
        steps 2 and 3.  Returns the deferred built by ``mustbe_deferred``
        with those callbacks attached.
        """
        fname = lambda f: '%s.%s' % (six.get_method_self(f).__class__.__name__,
                                     six.get_method_function(f).__name__)

        def process_spider_input(response):
            for method in self.methods['process_spider_input']:
                try:
                    result = method(response=response, spider=spider)
                    assert result is None, \
                            'Middleware %s must returns None or ' \
                            'raise an exception, got %s ' \
                            % (fname(method), type(result))
                except Exception:
                    # Only ordinary errors become a Failure.  A bare
                    # ``except:`` would also trap KeyboardInterrupt,
                    # SystemExit and GeneratorExit.
                    return scrape_func(Failure(), request, spider)
            # Result of scrape_func after the spider handled the response;
            # per the original note this is a deferred that carries the
            # next-level callback.
            return scrape_func(response, request, spider)

        def process_spider_exception(_failure):
            # Only runs when something went wrong earlier in the chain.
            exception = _failure.value
            for method in self.methods['process_spider_exception']:
                result = method(response=response,
                                exception=exception,
                                spider=spider)
                assert result is None or _isiterable(result), \
                    'Middleware %s must returns None, or an iterable object, got %s ' % \
                    (fname(method), type(result))
                if result is not None:
                    return result
            return _failure

        def process_spider_output(result):
            # Each method receives the output of the previous one.
            for method in self.methods['process_spider_output']:
                result = method(response=response,
                                result=result,
                                spider=spider)
                assert _isiterable(result), \
                    'Middleware %s must returns an iterable object, got %s ' % \
                    (fname(method), type(result))
            return result

        # Pre-process the incoming response before the spider sees it.
        dfd = mustbe_deferred(process_spider_input, response)
        dfd.addErrback(process_spider_exception)
        dfd.addCallback(process_spider_output)
        return dfd
Example #35
0
    def scrape_response(self, scrape_func: ScrapeFunc, response: Response,
                        request: Request, spider: Spider) -> Deferred:
        """Run the spider-input step and attach output/exception handlers.

        ``self._process_spider_input`` is invoked through
        ``mustbe_deferred``.  Its successful result is passed to
        ``self._process_callback_output`` and a failure to
        ``self._process_spider_exception``, both together with
        ``response`` and ``spider``.
        """
        def on_output(result: Iterable) -> MutableChain:
            return self._process_callback_output(response, spider, result)

        def on_failure(failure: Failure) -> Union[Failure, MutableChain]:
            return self._process_spider_exception(response, spider, failure)

        dfd = mustbe_deferred(
            self._process_spider_input, scrape_func, response, request, spider)
        dfd.addCallbacks(callback=on_output, errback=on_failure)
        return dfd
Example #36
0
 def _on_success(response):
     """Handle the result of a page download.

     A ``Response`` gets the originating request attached, is logged at
     DEBUG level and is returned.  A ``Request`` is scheduled through
     ``mustbe_deferred``; if it has a callback, its callback/errback pair
     is attached to the scheduling deferred, which is returned.
     """
     assert isinstance(response, (Response, Request))
     if isinstance(response, Response):
         # Tie the request to the response that was received for it.
         response.request = request
         logline = self._crawled_logline(request, response)
         log.msg(logline, level=log.DEBUG, spider=spider)
         return response
     if isinstance(response, Request):
         scheduled = mustbe_deferred(self.schedule, response, spider)
         if response.callback:
             # XXX: this is a bit hacky and should be removed
             scheduled.addCallbacks(response.callback, response.errback)
         return scheduled
Example #37
0
 def crawl(self, request, spider):
     """Schedule ``request`` for ``spider`` and pass the outcome on.

     Requests for spiders in ``self.closing`` are ignored (``None`` is
     returned) before anything else is checked.  Otherwise the spider
     must be open; ``self.schedule`` is invoked through
     ``mustbe_deferred``, its result goes to
     ``self.scraper.enqueue_scrape``, any remaining error is logged, and
     finally ``self.next_request(spider)`` is called.
     """
     if spider in self.closing:
         # The spider is being closed: ignore the request.
         return None
     assert spider in self.open_spiders, (
         "Spider %r not opened when crawling: %s" % (spider.name, request)
     )
     scheduled = mustbe_deferred(self.schedule, request, spider)
     # FIXME: errors cannot be logged at this point, because that would
     # prevent them from propagating to the request errback.  This should
     # be fixed after the next core refactoring.
     scheduled.addBoth(self.scraper.enqueue_scrape, request, spider)
     scheduled.addErrback(
         log.err, "Unhandled error on engine.crawl()", spider=spider)
     # Whatever the outcome, ask for the next request.
     scheduled.addBoth(lambda _ignored: self.next_request(spider))
Example #38
0
    def download(self, download_func, request, spider):
        """Run ``request`` through the downloader middleware hooks.

        ``process_request`` hooks run first and fall back to
        ``download_func`` when none of them yields a truthy value.
        ``process_exception`` is attached as the errback and
        ``process_response`` as the callback that follows it.  Returns the
        resulting deferred.
        """
        def _owner(mw_method):
            # Class name of the middleware that a bound hook belongs to.
            return six.get_method_self(mw_method).__class__.__name__

        @defer.inlineCallbacks
        def process_request(request):
            for mw_method in self.methods['process_request']:
                # Splash request: do not process it again (by wsy).
                if request.meta.get("_splash_processed"):
                    break
                outcome = yield mw_method(request=request, spider=spider)
                assert outcome is None or isinstance(outcome, (Response, Request)), (
                    'Middleware %s.process_request must return None, Response or Request, got %s'
                    % (_owner(mw_method), outcome.__class__.__name__))
                if outcome:
                    defer.returnValue(outcome)
            downloaded = yield download_func(request=request, spider=spider)
            defer.returnValue(downloaded)

        @defer.inlineCallbacks
        def process_response(response):
            assert response is not None, 'Received None in process_response'
            if not isinstance(response, Request):
                for mw_method in self.methods['process_response']:
                    response = yield mw_method(
                        request=request, response=response, spider=spider)
                    assert isinstance(response, (Response, Request)), (
                        'Middleware %s.process_response must return Response or Request, got %s'
                        % (_owner(mw_method), type(response)))
                    if isinstance(response, Request):
                        # A Request stops the chain immediately.
                        break
            defer.returnValue(response)

        @defer.inlineCallbacks
        def process_exception(_failure):
            exc = _failure.value
            for mw_method in self.methods['process_exception']:
                outcome = yield mw_method(
                    request=request, exception=exc, spider=spider)
                assert outcome is None or isinstance(outcome, (Response, Request)), (
                    'Middleware %s.process_exception must return None, Response or Request, got %s'
                    % (_owner(mw_method), type(outcome)))
                if outcome:
                    defer.returnValue(outcome)
            # No hook handled the exception: hand the failure back.
            defer.returnValue(_failure)

        chain = mustbe_deferred(process_request, request)
        chain.addErrback(process_exception)
        chain.addCallback(process_response)
        return chain
Example #39
0
File: media.py Project: 01-/scrapy
 def _check_media_to_download(self, result, request, info):
     if result is not None:
         return result
     if self.download_func:
         # this ugly code was left only to support tests. TODO: remove
         dfd = mustbe_deferred(self.download_func, request, info.spider)
         dfd.addCallbacks(
             callback=self.media_downloaded, callbackArgs=(request, info),
             errback=self.media_failed, errbackArgs=(request, info))
     else:
         request.meta['handle_httpstatus_all'] = True
         dfd = self.crawler.engine.download(request, info.spider)
         dfd.addCallbacks(
             callback=self.media_downloaded, callbackArgs=(request, info),
             errback=self.media_failed, errbackArgs=(request, info))
     return dfd
Example #40
0
 def _check_media_to_download(self, result, request, info):
     if result is not None:
         return result
     if self.download_func:
         # this ugly code was left only to support tests. TODO: remove
         dfd = mustbe_deferred(self.download_func, request, info.spider)
         dfd.addCallbacks(
             callback=self.media_downloaded, callbackArgs=(request, info),
             errback=self.media_failed, errbackArgs=(request, info))
     else:
         self._modify_media_request(request)
         dfd = self.crawler.engine.download(request, info.spider)
         dfd.addCallbacks(
             callback=self.media_downloaded, callbackArgs=(request, info),
             errback=self.media_failed, errbackArgs=(request, info))
     return dfd
Example #41
0
    def _download(self, site, request, spider):
        """Start downloading ``request`` and track it in ``site.transferring``.

        The download is started via ``self.handlers.download_request``
        (wrapped with ``mustbe_deferred``).  The request is added to
        ``site.transferring`` and removed again, followed by a call to
        ``self._process_queue(spider)``, once the deferred fires with
        either a result or a failure.

        Returns the download deferred with that cleanup hook attached.
        """
        # The order is very important for the following deferreds. Do not change!

        # 1. Create the download deferred
        dfd = mustbe_deferred(self.handlers.download_request, request, spider)

        # 2. After response arrives,  remove the request from transferring
        # state to free up the transferring slot so it can be used by the
        # following requests (perhaps those which came from the downloader
        # middleware itself)
        site.transferring.add(request)
        def finish_transferring(_):
            # ``_`` is the result or failure; it is passed through unchanged
            # so later callbacks still see it.
            site.transferring.remove(request)
            self._process_queue(spider)
            return _
        return dfd.addBoth(finish_transferring)
Example #42
0
    def download(self, download_func, request, spider):
        """Run ``request`` through the downloader middleware hooks.

        ``process_request`` hooks run first and fall back to
        ``download_func`` when none of them yields a truthy value.
        ``process_exception`` is attached as the errback and
        ``process_response`` as the callback that follows it.  Returns the
        resulting deferred.
        """
        def owner_name(hook):
            # Class name of the middleware that a bound hook belongs to.
            return six.get_method_self(hook).__class__.__name__

        @defer.inlineCallbacks
        def process_request(request):
            for hook in self.methods["process_request"]:
                produced = yield hook(request=request, spider=spider)
                assert produced is None or isinstance(produced, (Response, Request)), (
                    "Middleware %s.process_request must return None, Response or Request, got %s"
                    % (owner_name(hook), produced.__class__.__name__)
                )
                if produced:
                    defer.returnValue(produced)
            fetched = yield download_func(request=request, spider=spider)
            defer.returnValue(fetched)

        @defer.inlineCallbacks
        def process_response(response):
            assert response is not None, "Received None in process_response"
            if not isinstance(response, Request):
                for hook in self.methods["process_response"]:
                    response = yield hook(request=request, response=response, spider=spider)
                    assert isinstance(response, (Response, Request)), (
                        "Middleware %s.process_response must return Response or Request, got %s"
                        % (owner_name(hook), type(response))
                    )
                    if isinstance(response, Request):
                        # A Request stops the chain immediately.
                        break
            defer.returnValue(response)

        @defer.inlineCallbacks
        def process_exception(_failure):
            error = _failure.value
            for hook in self.methods["process_exception"]:
                produced = yield hook(request=request, exception=error, spider=spider)
                assert produced is None or isinstance(produced, (Response, Request)), (
                    "Middleware %s.process_exception must return None, Response or Request, got %s"
                    % (owner_name(hook), type(produced))
                )
                if produced:
                    defer.returnValue(produced)
            # No hook handled the exception: hand the failure back.
            defer.returnValue(_failure)

        pipeline = mustbe_deferred(process_request, request)
        pipeline.addErrback(process_exception)
        pipeline.addCallback(process_response)
        return pipeline
Example #43
0
    def download(self, download_func, request, spider):
        """Run ``request`` through the downloader middleware hooks.

        Hooks are wired so that ``process_request`` runs first,
        ``process_exception`` is attached as the errback, and
        ``process_response`` is attached last.  Returns the resulting
        deferred.
        """
        def _mw_class(bound):
            # Class name of the middleware that a bound hook belongs to.
            return six.get_method_self(bound).__class__.__name__

        @defer.inlineCallbacks
        def process_request(request):
            for bound in self.methods['process_request']:
                answer = yield bound(request=request, spider=spider)
                # The return type is restricted to None, Response or Request.
                assert answer is None or isinstance(answer, (Response, Request)), (
                    'Middleware %s.process_request must return None, Response or Request, got %s'
                    % (_mw_class(bound), answer.__class__.__name__))
                # While hooks return nothing the chain keeps going; the
                # first truthy answer ends it early.
                if answer:
                    defer.returnValue(answer)
            fetched = yield download_func(request=request, spider=spider)
            defer.returnValue(fetched)

        @defer.inlineCallbacks
        def process_response(response):
            assert response is not None, 'Received None in process_response'
            if not isinstance(response, Request):
                for bound in self.methods['process_response']:
                    response = yield bound(
                        request=request, response=response, spider=spider)
                    assert isinstance(response, (Response, Request)), (
                        'Middleware %s.process_response must return Response or Request, got %s'
                        % (_mw_class(bound), type(response)))
                    if isinstance(response, Request):
                        # A Request stops the chain immediately.
                        break
            defer.returnValue(response)

        @defer.inlineCallbacks
        def process_exception(_failure):
            raised = _failure.value
            for bound in self.methods['process_exception']:
                answer = yield bound(
                    request=request, exception=raised, spider=spider)
                assert answer is None or isinstance(answer, (Response, Request)), (
                    'Middleware %s.process_exception must return None, Response or Request, got %s'
                    % (_mw_class(bound), type(answer)))
                if answer:
                    defer.returnValue(answer)
            # No hook handled the exception: hand the failure back.
            defer.returnValue(_failure)

        # Execution order: request first, then exception, and only then
        # response.
        flow = mustbe_deferred(process_request, request)
        flow.addErrback(process_exception)
        flow.addCallback(process_response)
        return flow
Example #44
0
    def _download(self, slot, request, spider):
        """Start downloading ``request`` and track it in ``slot.transferring``.

        The download is started via ``self.handlers.download_request``
        (wrapped with ``mustbe_deferred``).  The request is added to
        ``slot.transferring`` and removed again, followed by a call to
        ``self._process_queue(spider, slot)``, once the deferred fires with
        either a result or a failure.

        Returns the download deferred with that cleanup hook attached.
        """
        # The order is very important for the following deferreds. Do not change!

        # 1. Create the download deferred
        dfd = mustbe_deferred(self.handlers.download_request, request, spider)

        # 2. After response arrives,  remove the request from transferring
        # state to free up the transferring slot so it can be used by the
        # following requests (perhaps those which came from the downloader
        # middleware itself)
        slot.transferring.add(request)

        def finish_transferring(_):
            # ``_`` is the result or failure; it is passed through unchanged
            # so later callbacks still see it.
            slot.transferring.remove(request)
            self._process_queue(spider, slot)
            return _

        return dfd.addBoth(finish_transferring)
Example #45
0
    def scrape_response(self, scrape_func: ScrapeFunc, response: Response,
                        request: Request, spider: Spider) -> Deferred:
        """Run the spider-input step and attach output/exception handlers.

        ``self._process_spider_input`` is invoked through
        ``mustbe_deferred``.  Its successful result is awaited through
        ``self._process_callback_output`` (a coroutine adapted with
        ``deferred_f_from_coro_f``) and a failure goes to
        ``self._process_spider_exception``.
        """
        async def handle_output(
            result: Union[Iterable, AsyncIterable]
        ) -> Union[MutableChain, MutableAsyncChain]:
            processed = await self._process_callback_output(
                response, spider, result)
            return processed

        def handle_failure(failure: Failure) -> Union[Failure, MutableChain]:
            return self._process_spider_exception(response, spider, failure)

        dfd = mustbe_deferred(
            self._process_spider_input, scrape_func, response, request, spider)
        output_callback = deferred_f_from_coro_f(handle_output)
        dfd.addCallbacks(callback=output_callback, errback=handle_failure)
        return dfd
Example #46
0
    def _download(self, slot, request, spider):
        """Download ``request`` while recording start/end rows in the database.

        ``request.meta['the_id_from_scrapy']`` is initialised to 0 and
        replaced with the value returned by ``self.write_database_start``
        when the download actually starts.  On success the end time and
        body size are written with ``self.write_database_end`` and the
        ``response_downloaded`` signal is sent; on failure the record is
        removed with ``self.delete_bad_with_id``.  In every case the request
        is removed from ``slot.transferring`` and the queue is processed.

        Returns the download deferred with all hooks attached.
        """
        # The order is very important for the following deferreds. Do not change!

        request.meta['the_id_from_scrapy'] = 0

        # 1. Create the download deferred

        def download_request(request, spider):
            starttime = str(datetime.datetime.now())
            request.meta['the_id_from_scrapy'] = self.write_database_start(spider.name, starttime)
            return self.handlers[spider.name].download_request(request, spider)

        dfd = mustbe_deferred(download_request, request, spider)

        # 2. Notify response_downloaded listeners about the recent download
        # before querying queue for next request
        def _downloaded(response):
            # Measure the body itself.  ``str()`` of a bytes body on Python 3
            # yields its repr (``b'...'`` plus escapes), so
            # ``len(str(body))`` over-reported the size; on Python 2 the
            # two forms are equal.
            size = len(response.body)
            self.write_database_end(request.meta['the_id_from_scrapy'], str(datetime.datetime.now()), size)
            self.signals[spider.name].send_catch_log(signal=signals.response_downloaded,
                                                     response=response,
                                                     request=request,
                                                     spider=spider)
            return response

        dfd.addCallback(_downloaded)

        def _err_handle(failure):
            # Drop the bookkeeping row for a failed download, then let the
            # failure continue down the chain.
            self.delete_bad_with_id(request.meta['the_id_from_scrapy'])
            return failure

        dfd.addErrback(_err_handle)
        # 3. After response arrives,  remove the request from transferring
        # state to free up the transferring slot so it can be used by the
        # following requests (perhaps those which came from the downloader
        # middleware itself)
        slot.transferring.add(request)

        def finish_transferring(_):
            slot.transferring.remove(request)
            self._process_queue(spider, slot)
            return _

        return dfd.addBoth(finish_transferring)
Example #47
0
    def download(self, download_func, request, spider):
        """Run ``request`` through the downloader middleware hooks.

        ``process_request`` hooks run first and fall back to
        ``download_func`` when none of them returns a truthy value.
        ``process_exception`` is attached as the errback and
        ``process_response`` as the callback that follows it.  Returns the
        resulting deferred.
        """
        def _owner(mw_method):
            # Class name of the middleware that a bound hook belongs to.
            return six.get_method_self(mw_method).__class__.__name__

        def process_request(request):
            for mw_method in self.methods['process_request']:
                outcome = mw_method(request=request, spider=spider)
                assert outcome is None or isinstance(outcome, (Response, Request)), (
                    'Middleware %s.process_request must return None, Response or Request, got %s'
                    % (_owner(mw_method), outcome.__class__.__name__))
                if outcome:
                    return outcome
            return download_func(request=request, spider=spider)

        def process_response(response):
            assert response is not None, 'Received None in process_response'
            if not isinstance(response, Request):
                for mw_method in self.methods['process_response']:
                    response = mw_method(
                        request=request, response=response, spider=spider)
                    assert isinstance(response, (Response, Request)), (
                        'Middleware %s.process_response must return Response or Request, got %s'
                        % (_owner(mw_method), type(response)))
                    if isinstance(response, Request):
                        # A Request stops the chain immediately.
                        break
            return response

        def process_exception(_failure):
            exc = _failure.value
            for mw_method in self.methods['process_exception']:
                outcome = mw_method(
                    request=request, exception=exc, spider=spider)
                assert outcome is None or isinstance(outcome, (Response, Request)), (
                    'Middleware %s.process_exception must return None, Response or Request, got %s'
                    % (_owner(mw_method), type(outcome)))
                if outcome:
                    return outcome
            # No hook handled the exception: hand the failure back.
            return _failure

        chain = mustbe_deferred(process_request, request)
        chain.addErrback(process_exception)
        chain.addCallback(process_response)
        return chain
Example #48
0
    def download(self, download_func, request, spider):
        """Run ``request`` through the request/response/exception middleware.

        ``self.request_middleware`` hooks run first and fall back to
        ``download_func`` when none of them returns a truthy value.
        ``process_exception`` is attached as the errback and
        ``process_response`` as the callback that follows it.  Unless the
        incoming value is already a ``Request``, ``process_response`` sends
        the ``response_received`` signal exactly once before returning.

        Returns the resulting deferred.
        """
        def _owner(method):
            # ``__self__`` exists on bound methods in Python 2.6+ and in
            # Python 3.  The former ``im_self`` is Python 2 only and turned
            # a failed contract check into an AttributeError on Python 3.
            return method.__self__.__class__.__name__

        def process_request(request):
            for method in self.request_middleware:
                response = method(request=request, spider=spider)
                assert response is None or isinstance(response, (Response, Request)), \
                        'Middleware %s.process_request must return None, Response or Request, got %s' % \
                        (_owner(method), response.__class__.__name__)
                if response:
                    return response
            return download_func(request=request, spider=spider)

        def process_response(response):
            assert response is not None, 'Received None in process_response'
            if isinstance(response, Request):
                return response

            for method in self.response_middleware:
                response = method(request=request, response=response, spider=spider)
                assert isinstance(response, (Response, Request)), \
                    'Middleware %s.process_response must return Response or Request, got %s' % \
                    (_owner(method), type(response))
                if isinstance(response, Request):
                    # A Request stops the chain; the signal below is still
                    # sent for it, as before.
                    break
            send_catch_log(signal=signals.response_received, sender=self.__class__, \
                response=response, spider=spider)
            return response

        def process_exception(_failure):
            exception = _failure.value
            for method in self.exception_middleware:
                response = method(request=request, exception=exception, spider=spider)
                assert response is None or isinstance(response, (Response, Request)), \
                    'Middleware %s.process_exception must return None, Response or Request, got %s' % \
                    (_owner(method), type(response))
                if response:
                    return response
            # No hook handled the exception: hand the failure back.
            return _failure

        deferred = mustbe_deferred(process_request, request)
        deferred.addErrback(process_exception)
        deferred.addCallback(process_response)
        return deferred
Example #49
0
    def process_pokemon_request(self, request, info):
        """Process ``request`` and return a deferred that fires its callbacks.

        The request's callback/errback are detached and attached to a
        fresh waiter deferred, which is registered in ``info.waiting``
        under the request fingerprint.  The media pipeline steps
        (``media_to_download``, ``_check_media_to_download``,
        ``_cache_result_and_execute_waiters``) are then chained, errors are
        logged, and the waiter deferred is what the chain finally yields.
        """
        fp = request_fingerprint(request)

        # Detach the callbacks from the request; a missing callback is
        # replaced by a pass-through.
        on_success = request.callback or (lambda value: value)
        on_error = request.errback
        request.callback = request.errback = None

        # Otherwise, wait for result
        waiter = Deferred().addCallbacks(on_success, on_error)
        info.waiting[fp].append(waiter)
        info.downloading.add(fp)

        def _log_failure(failure):
            return logger.error(failure.value,
                                exc_info=failure_to_exc_info(failure),
                                extra={'spider': info.spider})

        dfd = mustbe_deferred(self.media_to_download, request, info)
        dfd.addCallback(self._check_media_to_download, request, info)
        dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
        dfd.addErrback(_log_failure)
        # The waiter deferred must be the last thing returned.
        return dfd.addBoth(lambda _result: waiter)
Example #50
0
    def scrape_response(self, scrape_func, response, request, spider):
        """Run ``response`` through the spider middleware chain.

        The ``process_spider_input`` methods run first, followed by
        ``scrape_func``.  ``process_spider_exception`` is registered as the
        errback and ``process_spider_output`` as the callback that follows
        it.

        Returns the deferred built by ``mustbe_deferred`` with those
        callbacks attached.
        """
        # ``__self__``/``__func__`` exist on bound methods in Python 2.6+ and
        # in Python 3; the former ``im_self``/``im_func`` are Python 2 only.
        fname = lambda f: '%s.%s' % (f.__self__.__class__.__name__,
                                     f.__func__.__name__)

        def process_spider_input(response):
            for method in self.methods['process_spider_input']:
                try:
                    result = method(response=response, spider=spider)
                    assert result is None, \
                            'Middleware %s must returns None or ' \
                            'raise an exception, got %s ' \
                            % (fname(method), type(result))
                except Exception:
                    # Only ordinary errors become a Failure.  A bare
                    # ``except:`` would also trap KeyboardInterrupt,
                    # SystemExit and GeneratorExit.
                    return scrape_func(Failure(), request, spider)
            return scrape_func(response, request, spider)

        def process_spider_exception(_failure):
            # The first non-None middleware result replaces the failure.
            exception = _failure.value
            for method in self.methods['process_spider_exception']:
                result = method(response=response,
                                exception=exception,
                                spider=spider)
                assert result is None or _isiterable(result), \
                    'Middleware %s must returns None, or an iterable object, got %s ' % \
                    (fname(method), type(result))
                if result is not None:
                    return result
            return _failure

        def process_spider_output(result):
            # Each method receives the output of the previous one.
            for method in self.methods['process_spider_output']:
                result = method(response=response,
                                result=result,
                                spider=spider)
                assert _isiterable(result), \
                    'Middleware %s must returns an iterable object, got %s ' % \
                    (fname(method), type(result))
            return result

        dfd = mustbe_deferred(process_spider_input, response)
        dfd.addErrback(process_spider_exception)
        dfd.addCallback(process_spider_output)
        return dfd
Example #51
0
    def download(self, download_func, request, spider):
        """Run ``request`` through the downloader middleware hooks.

        ``process_request`` hooks run first and fall back to
        ``download_func`` when none of them returns a truthy value.
        ``process_exception`` is attached as the errback and
        ``process_response`` as the callback that follows it.  An error
        raised by a ``process_response`` hook is routed straight to
        ``process_exception``.

        Returns the resulting deferred.
        """
        def _owner(method):
            # ``__self__`` exists on bound methods in Python 2.6+ and in
            # Python 3.  The former ``im_self`` is Python 2 only and turned
            # a failed contract check into an AttributeError on Python 3.
            return method.__self__.__class__.__name__

        def process_request(request):
            for method in self.methods['process_request']:
                response = method(request=request, spider=spider)
                assert response is None or isinstance(response, (Response, Request)), \
                        'Middleware %s.process_request must return None, Response or Request, got %s' % \
                        (_owner(method), response.__class__.__name__)
                if response:
                    return response
            return download_func(request=request, spider=spider)

        def process_response(response):
            assert response is not None, 'Received None in process_response'
            if isinstance(response, Request):
                return response

            for method in self.methods['process_response']:
                try:
                    response = method(request=request, response=response, spider=spider)
                except Exception:
                    # Only ordinary errors are handed to the exception
                    # hooks.  A bare ``except:`` would also trap
                    # KeyboardInterrupt, SystemExit and GeneratorExit.
                    return process_exception(Failure())
                assert isinstance(response, (Response, Request)), \
                    'Middleware %s.process_response must return Response or Request, got %s' % \
                    (_owner(method), type(response))
                if isinstance(response, Request):
                    return response
            return response

        def process_exception(_failure):
            exception = _failure.value
            for method in self.methods['process_exception']:
                response = method(request=request, exception=exception, spider=spider)
                assert response is None or isinstance(response, (Response, Request)), \
                    'Middleware %s.process_exception must return None, Response or Request, got %s' % \
                    (_owner(method), type(response))
                if response:
                    return response
            # No hook handled the exception: hand the failure back.
            return _failure

        deferred = mustbe_deferred(process_request, request)
        deferred.addErrback(process_exception)
        deferred.addCallback(process_response)
        return deferred
Example #52
0
    def download(self, request, spider):
        """Fetch ``request`` via the downloader and post-process the outcome.

        If ``spider`` is not registered in ``self.downloader.sites`` an
        already-failed deferred carrying ``IgnoreRequest`` is returned.
        Otherwise the downloader's ``fetch`` is invoked; a ``Response`` is
        tied to its request and logged, a ``Request`` is handed to
        ``self.schedule``, and any error is logged and replaced by an
        ``IgnoreRequest`` failure. In every case ``self.next_request(spider)``
        is called once the deferred fires.
        """
        def _on_success(response):
            """Handle the outcome of a finished page download."""
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                # tie request to response received
                response.request = request
                logline = self._crawled_logline(request, response)
                log.msg(logline, level=log.DEBUG, spider=spider)
                return response
            if isinstance(response, Request):
                # The download produced a new request: schedule it instead.
                scheduled = mustbe_deferred(self.schedule, response, spider)
                if response.callback:
                    # XXX: this is a bit hacky and should be removed
                    scheduled.addCallbacks(response.callback, response.errback)
                return scheduled

        def _on_error(_failure):
            """Log a download error and convert it to ``IgnoreRequest``."""
            exc = _failure.value
            if not isinstance(exc, IgnoreRequest):
                errmsg, level = str(_failure), log.ERROR
            else:
                errmsg, level = _failure.getErrorMessage(), exc.level
            if errmsg:
                log.msg("Crawling <%s>: %s" % (request.url, errmsg),
                        level=level, spider=spider)
            return Failure(IgnoreRequest(str(exc)))

        def _on_complete(_):
            # Runs for success and failure alike; passes the result through.
            self.next_request(spider)
            return _

        if spider not in self.downloader.sites:
            rejected = defer.fail(Failure(IgnoreRequest()))
            return rejected.addBoth(_on_complete)

        dwld = mustbe_deferred(self.downloader.fetch, request, spider)
        return dwld.addCallbacks(_on_success, _on_error).addBoth(_on_complete)
Example #53
0
    def scrape_response(self, scrape_func, response, request, spider):
        """Run ``response`` through the spider middleware hooks.

        Each ``process_spider_input`` hook is called first; if one raises (or
        returns something other than ``None``) ``scrape_func`` is called with
        a ``Failure`` instead of the response. A failure on the resulting
        deferred is offered to the ``process_spider_exception`` hooks, and any
        non-failure result is piped through the ``process_spider_output``
        hooks.

        Returns the deferred produced by ``mustbe_deferred`` with both stages
        attached.
        """
        def fname(f):
            # "<MiddlewareClass>.<method>" label used in assertion messages.
            return '%s.%s' % (f.im_self.__class__.__name__, f.im_func.__name__)

        def process_spider_input(response):
            for hook in self.methods['process_spider_input']:
                try:
                    outcome = hook(response=response, spider=spider)
                    assert outcome is None, \
                        'Middleware %s must returns None or ' \
                        'raise an exception, got %s ' \
                        % (fname(hook), type(outcome))
                except:
                    # Failure() captures the exception being handled here.
                    return scrape_func(Failure(), request, spider)
            return scrape_func(response, request, spider)

        def process_spider_exception(_failure):
            error = _failure.value
            for hook in self.methods['process_spider_exception']:
                outcome = hook(
                    response=response, exception=error, spider=spider)
                assert outcome is None or _isiterable(outcome), \
                    'Middleware %s must returns None, or an iterable object, got %s ' % \
                    (fname(hook), type(outcome))
                if outcome is not None:
                    # First hook that returns an iterable ends the handling.
                    return outcome
            return _failure

        def process_spider_output(result):
            # Each hook receives the previous hook's output.
            for hook in self.methods['process_spider_output']:
                result = hook(
                    response=response, result=result, spider=spider)
                assert _isiterable(result), \
                    'Middleware %s must returns an iterable object, got %s ' % \
                    (fname(hook), type(result))
            return result

        pipeline = mustbe_deferred(process_spider_input, response)
        return pipeline.addErrback(process_spider_exception) \
                       .addCallback(process_spider_output)
Example #54
0
    def _download(self, request, spider):
        """Register ``request`` on the spider's slot and fetch it.

        On success a ``Response`` is tied to its request, logged at DEBUG
        level and announced through the ``response_received`` signal; any
        other result is passed through unchanged. ``self.next_request`` is
        called once the download deferred fires, whatever the outcome.

        Returns the download deferred.
        """
        self.slots[spider].add_request(request)

        def _on_success(response):
            """Handle the outcome of a finished page download."""
            assert isinstance(response, (Response, Request))
            if not isinstance(response, Response):
                return response
            # tie request to response received
            response.request = request
            log.msg(log.formatter.crawled(request, response, spider),
                    level=log.DEBUG, spider=spider)
            send_catch_log(signal=signals.response_received,
                           response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            # Runs for success and failure alike; passes the result through.
            self.next_request(spider)
            return _

        dwld = mustbe_deferred(self.downloader.fetch, request, spider)
        return dwld.addCallback(_on_success).addBoth(_on_complete)
Example #55
0
    def _download(self, site, request, spider):
        """Start downloading ``request`` and track it in ``site.transferring``.

        The request is added to ``site.transferring`` and removed again when
        the download deferred fires (success or failure), after which
        ``self._process_queue(spider)`` is called. If ``site.closing`` is set
        at that point, the result is discarded by raising ``IgnoreRequest``.

        Returns the download deferred with ``finish_transferring`` attached.
        """
        # The order is very important for the following deferreds. Do not change!

        # 1. Create the download deferred
        dfd = mustbe_deferred(download_any, request, spider)

        # 2. After response arrives, remove the request from transferring
        # state to free up the transferring slot so it can be used by the
        # following requests (perhaps those which came from the downloader
        # middleware itself)
        site.transferring.add(request)
        def finish_transferring(_):
            # ``_`` is whatever the deferred produced (result or failure); it
            # is returned unchanged unless the site is closing.
            site.transferring.remove(request)
            self._process_queue(spider)
            # avoid partially downloaded responses from propagating to the
            # downloader middleware, to speed-up the closing process
            if site.closing:
                log.msg("Crawled while closing spider: %s" % request, \
                    level=log.DEBUG, spider=spider)
                raise IgnoreRequest
            return _
        return dfd.addBoth(finish_transferring)
Example #56
0
    def _download(self, slot, request, spider):
        """Start downloading ``request`` and track it in ``slot.transferring``.

        A successful result triggers the ``response_downloaded`` signal. When
        the deferred fires (success or failure) the request is removed from
        ``slot.transferring``, ``self._process_queue(spider, slot)`` is
        called and the ``request_left_downloader`` signal is sent.

        Returns the download deferred with both callbacks attached.
        """
        # The order is very important for the following deferreds. Do not change!

        # 1. Create the download deferred
        dfd = mustbe_deferred(self.handlers.download_request, request, spider)

        # 2. Notify response_downloaded listeners about the recent download
        # before querying queue for next request
        def _downloaded(response):
            # Signal response_downloaded: the response has just been obtained.
            self.signals.send_catch_log(signal=signals.response_downloaded,
                                        response=response,
                                        request=request,
                                        spider=spider)
            return response

        dfd.addCallback(_downloaded)

        # 3. After response arrives, remove the request from transferring
        # state to free up the transferring slot so it can be used by the
        # following requests (perhaps those which came from the downloader
        # middleware itself)
        slot.transferring.add(request)

        def finish_transferring(_):
            # ``_`` is whatever the deferred produced (result or failure) and
            # is passed through unchanged.
            slot.transferring.remove(request)
            self._process_queue(spider, slot)
            # Signal request_left_downloader: the request has just left the
            # downloader.
            self.signals.send_catch_log(signal=signals.request_left_downloader,
                                        request=request,
                                        spider=spider)
            return _

        return dfd.addBoth(finish_transferring)
Example #57
0
    def _download(self, site, request, spider):
        """Start downloading ``request`` and track it in ``site.transferring``.

        The request is added to ``site.transferring`` and removed again when
        the download deferred fires (success or failure), after which
        ``self._process_queue(spider)`` is called. If ``site.closing`` is set
        at that point, the result is discarded by raising ``IgnoreRequest``.

        Returns the download deferred with ``finish_transferring`` attached.
        """
        # The order is very important for the following deferreds. Do not change!

        # 1. Create the download deferred
        dfd = mustbe_deferred(self.handlers.download_request, request, spider)

        # 2. After response arrives, remove the request from transferring
        # state to free up the transferring slot so it can be used by the
        # following requests (perhaps those which came from the downloader
        # middleware itself)
        site.transferring.add(request)

        def finish_transferring(_):
            # ``_`` is whatever the deferred produced (result or failure); it
            # is returned unchanged unless the site is closing.
            site.transferring.remove(request)
            self._process_queue(spider)
            # avoid partially downloaded responses from propagating to the
            # downloader middleware, to speed-up the closing process
            if site.closing:
                log.msg("Crawled while closing spider: %s" % request, \
                    level=log.DEBUG, spider=spider)
                raise IgnoreRequest
            return _

        return dfd.addBoth(finish_transferring)
    def scrape_response(self, scrape_func, response, request, spider):
        """Run ``response`` through the spider middleware hooks.

        Calls every ``process_spider_input`` hook, then ``scrape_func``; a
        failure is offered to the ``process_spider_exception`` hooks and a
        non-failure result is piped through the ``process_spider_output``
        hooks. Returns the deferred with both stages attached.
        """
        # NOTE(review): scrape_func is supplied by the caller; here it is only
        # known to be called with (response-or-Failure, request, spider).

        fname = lambda f:'%s.%s' % (f.im_self.__class__.__name__, f.im_func.__name__)
        # fname(f) returns f.im_self.__class__.__name__ and
        # f.im_func.__name__ joined by a dot, for use in assertion messages.


        def process_spider_input(response):
            # Call, in order, the process_spider_input method of every
            # middleware stored in self.methods.
            for method in self.methods['process_spider_input']:
                try:
                    result = method(response=response, spider=spider)
                    # The value returned by each middleware is not stored or
                    # processed further; it is only checked to be None.

                    assert result is None, \
                            'Middleware %s must returns None or ' \
                            'raise an exception, got %s ' \
                            % ( fname(method), type( result ) )
                    # A None result means the call succeeded.
                    # NOTE(review): presumably a middleware modifies the
                    # response in place, which would explain why the return
                    # value is not kept -- TODO confirm.
                except:
                    return scrape_func(Failure(), request, spider)
                # If a middleware raises (or the assert above fails),
                # scrape_func is called with a Failure() built from the
                # active exception instead of the response.
                # NOTE(review): presumably that Failure then travels down the
                # errback side of the chain -- TODO confirm.

            return scrape_func(response, request, spider)
            # When every hook succeeds, scrape_func receives the response.
            # NOTE(review): presumably its result feeds the callback chain --
            # TODO confirm.


        def process_spider_exception(_failure):
            exception = _failure.value
            # The exception wrapped by the failure.
            for method in self.methods['process_spider_exception']:
                result = method(response=response, exception=exception, spider=spider)
                assert result is None or _isiterable(result), \
                    'Middleware %s must returns None, or an iterable object, got %s ' % \
                    (fname(method), type(result))

                # A non-None result breaks out of the hook loop and is
                # returned directly.
                # NOTE(review): since this errback is added before the
                # process_spider_output callback, the returned value
                # presumably continues into that callback -- TODO confirm.
                if result is not None:
                    return result
            return _failure

        def process_spider_output(result):
            # Each hook receives the previous hook's output.
            for method in self.methods['process_spider_output']:
                result = method(response=response, result=result, spider=spider)
                assert _isiterable(result), \
                    'Middleware %s must returns an iterable object, got %s ' % \
                    (fname(method), type(result))
            return result

        dfd = mustbe_deferred(process_spider_input, response)
        # NOTE(review): mustbe_deferred presumably calls
        # process_spider_input(response) and wraps the outcome in a deferred:
        # an error goes to the errback, a normal result to the next callback
        # -- TODO confirm against its definition.

        dfd.addErrback(process_spider_exception)
        # Attach the errback.


        dfd.addCallback(process_spider_output)
        # Attach the callback.

        return dfd
Example #59
0
    def scrape_response(self, scrape_func, response, request, spider):
        """Run ``response`` through the spider middleware hooks.

        Calls every ``process_spider_input`` hook, then ``scrape_func``. The
        resulting deferred gets ``process_spider_output`` as callback and
        ``process_spider_exception`` as errback, attached as a single pair.
        The two chains can hand control to each other, resuming at the hook
        index after the one that was running.

        Returns the deferred with that callback/errback pair attached.
        """
        # Builds a "<MiddlewareClass>.<method>" label for error messages.
        fname = lambda f:'%s.%s' % (
                six.get_method_self(f).__class__.__name__,
                six.get_method_function(f).__name__)

        def process_spider_input(response):
            # Call each input hook in order; every hook must return None.
            for method in self.methods['process_spider_input']:
                try:
                    result = method(response=response, spider=spider)
                    if result is not None:
                        raise _InvalidOutput('Middleware {} must return None or raise an exception, got {}' \
                                             .format(fname(method), type(result)))
                except _InvalidOutput:
                    # A contract violation is re-raised rather than passed to
                    # scrape_func.
                    raise
                except Exception:
                    # Any other exception: call scrape_func with a Failure()
                    # built from the active exception instead of the response.
                    return scrape_func(Failure(), request, spider)
            return scrape_func(response, request, spider)

        def process_spider_exception(_failure, start_index=0):
            # Offers the failure's exception to the exception hooks, starting
            # at ``start_index``. Returns the failure itself when no hook
            # handles it, otherwise the outcome of process_spider_output.
            exception = _failure.value
            # don't handle _InvalidOutput exception
            if isinstance(exception, _InvalidOutput):
                return _failure
            method_list = islice(self.methods['process_spider_exception'], start_index, None)
            for method_index, method in enumerate(method_list, start=start_index):
                # None entries in the method list are skipped.
                if method is None:
                    continue
                result = method(response=response, exception=exception, spider=spider)
                if _isiterable(result):
                    # stop exception handling by handing control over to the
                    # process_spider_output chain if an iterable has been returned
                    return process_spider_output(result, method_index+1)
                elif result is None:
                    continue
                else:
                    raise _InvalidOutput('Middleware {} must return None or an iterable, got {}' \
                                         .format(fname(method), type(result)))
            return _failure

        def process_spider_output(result, start_index=0):
            # Pipes ``result`` through the output hooks, starting at
            # ``start_index``, and returns the final iterable chained with
            # anything recovered from exceptions along the way.

            # items in this iterable do not need to go through the process_spider_output
            # chain, they went through it already from the process_spider_exception method
            recovered = MutableChain()

            def evaluate_iterable(iterable, index):
                # Generator wrapper: an exception raised while iterating is
                # offered to the exception hooks from ``index + 1`` onwards.
                try:
                    for r in iterable:
                        yield r
                except Exception as ex:
                    exception_result = process_spider_exception(Failure(ex), index+1)
                    if isinstance(exception_result, Failure):
                        # Nobody handled it: re-raise the original exception.
                        raise
                    recovered.extend(exception_result)

            method_list = islice(self.methods['process_spider_output'], start_index, None)
            for method_index, method in enumerate(method_list, start=start_index):
                # None entries in the method list are skipped.
                if method is None:
                    continue
                # the following might fail directly if the output value is not a generator
                try:
                    result = method(response=response, result=result, spider=spider)
                except Exception as ex:
                    exception_result = process_spider_exception(Failure(ex), method_index+1)
                    if isinstance(exception_result, Failure):
                        # Nobody handled it: re-raise the original exception.
                        raise
                    return exception_result
                if _isiterable(result):
                    # Wrap the hook's output so errors raised during iteration
                    # are also routed to the exception hooks.
                    result = evaluate_iterable(result, method_index)
                else:
                    raise _InvalidOutput('Middleware {} must return an iterable, got {}' \
                                         .format(fname(method), type(result)))

            return chain(result, recovered)

        dfd = mustbe_deferred(process_spider_input, response)
        # Added as one callback/errback pair, unlike two separate
        # addErrback/addCallback calls.
        dfd.addCallbacks(callback=process_spider_output, errback=process_spider_exception)
        return dfd
Example #60
0
 def fetch(self, url):
     """Build a request for ``url``, pre-process it and start the download.

     Logs the URL, passes the new ``Request`` to ``self.process_request``
     and returns ``mustbe_deferred`` applied to the downloader's
     ``download_request`` with ``None`` as its last argument.
     """
     log.debug("Fetch URL %s" % url)
     req = Request(url=url)
     self.process_request(req)
     download_request = self.downloader.download_request
     return mustbe_deferred(download_request, req, None)