def _check_media_to_download(self, result, request, info):
    """Pass through a pre-download result, or start the real download.

    A non-None ``result`` is returned untouched.  Otherwise ``self.download``
    is invoked through ``mustbe_deferred`` and its outcome is routed to
    ``media_downloaded`` / ``media_failed`` together with ``request`` and
    ``info``.
    """
    if result is None:
        # Nothing was supplied up front: download and process the response.
        hook_args = (request, info)
        dwld = mustbe_deferred(self.download, request, info)
        return dwld.addCallbacks(
            callback=self.media_downloaded, callbackArgs=hook_args,
            errback=self.media_failed, errbackArgs=hook_args)
    return result
def _download(self, slot, request, spider):
    """Start the download of ``request`` and track it in ``slot.transferring``.

    Returns the download deferred.  The sequence in which the callbacks are
    attached below is significant and must be kept as is.
    """
    # 1. Create the download deferred
    download_dfd = mustbe_deferred(
        self.handlers.download_request, request, spider)

    # 2. Notify response_downloaded listeners about the recent download
    #    before querying queue for next request
    def _notify_downloaded(response):
        self.signals.send_catch_log(
            signal=signals.response_downloaded,
            response=response,
            request=request,
            spider=spider,
        )
        return response

    download_dfd.addCallback(_notify_downloaded)

    # 3. After response arrives, remove the request from transferring
    #    state to free up the transferring slot so it can be used by the
    #    following requests (perhaps those which came from the downloader
    #    middleware itself)
    slot.transferring.add(request)

    def _release_slot(outcome):
        slot.transferring.remove(request)
        self._process_queue(spider, slot)
        return outcome  # hand the result/failure on unchanged

    download_dfd.addBoth(_release_slot)
    return download_dfd
def _download(self, slot, request, spider):
    """Download ``request`` while it occupies a place in ``slot.transferring``.

    The returned deferred carries the handler's result (or failure) through
    unchanged.  Callback registration order is important -- do not reorder.
    """
    # Step 1: create the download deferred.
    dfd = mustbe_deferred(self.handlers.download_request, request, spider)

    # Step 2: tell response_downloaded listeners about the download before
    # the queue is asked for the next request.
    def _emit_downloaded_signal(response):
        signal_kwargs = dict(
            signal=signals.response_downloaded,
            response=response,
            request=request,
            spider=spider,
        )
        self.signals.send_catch_log(**signal_kwargs)
        return response

    dfd.addCallback(_emit_downloaded_signal)

    # Step 3: mark the request as transferring now; once a response (or a
    # failure) arrives, drop it again so the slot can serve the following
    # requests (perhaps those which came from the downloader middleware).
    slot.transferring.add(request)

    def _done_transferring(passthrough):
        slot.transferring.remove(request)
        self._process_queue(spider, slot)
        return passthrough

    return dfd.addBoth(_done_transferring)
def _process_request(self, request, info):
    """Resolve ``request`` from cache, an in-flight download, or a new one.

    The request's callback/errback are detached from the request and attached
    to the deferred that is eventually returned to the caller.
    """
    fingerprint = request_fingerprint(request)
    on_success = request.callback or (lambda _: _)
    on_failure = request.errback
    request.callback = request.errback = None

    # Already seen: replay the cached result.
    if fingerprint in info.downloaded:
        cached = defer_result(info.downloaded[fingerprint])
        return cached.addCallbacks(on_success, on_failure)

    # Not seen yet: register a waiter that fires once the result is known.
    waiter = Deferred().addCallbacks(on_success, on_failure)
    info.waiting[fingerprint].append(waiter)

    # A download for this fingerprint is in progress; do not start another.
    if fingerprint in info.downloading:
        return waiter

    def _log_failure(failure):
        logger.error(
            failure.value,
            exc_info=failure_to_exc_info(failure),
            extra={'spider': info.spider},
        )

    # Start the download, consulting the media_to_download hook first.
    info.downloading.add(fingerprint)
    dfd = mustbe_deferred(self.media_to_download, request, info)
    dfd.addCallback(self._check_media_to_download, request, info)
    dfd.addBoth(self._cache_result_and_execute_waiters, fingerprint, info)
    dfd.addErrback(_log_failure)
    # Whatever happened above, the caller must end up with the waiter.
    return dfd.addBoth(lambda _: waiter)
def download(self, request, spider):
    """Fetch ``request`` through the downloader and return a deferred.

    Successful responses are tied to ``request`` and logged; a returned
    Request is handed to ``self.schedule``.  Errors are logged and replaced
    by an ``IgnoreRequest`` failure.  ``self.next_request(spider)`` is called
    once the outcome is known, whatever it is.
    """

    def _handle_result(response):
        """handle the result of a page download"""
        assert isinstance(response, (Response, Request))
        if isinstance(response, Response):
            response.request = request  # tie request to response received
            crawled_line = log.formatter.crawled(request, response, spider)
            log.msg(crawled_line, level=log.DEBUG, spider=spider)
            return response
        if isinstance(response, Request):
            return mustbe_deferred(self.schedule, response, spider)

    def _handle_error(failure):
        """handle an error processing a page"""
        exc = failure.value
        errmsg = (failure.getErrorMessage()
                  if isinstance(exc, IgnoreRequest) else str(failure))
        if errmsg:
            log.msg("Error downloading <%s>: %s" % (request.url, errmsg),
                    level=log.ERROR, spider=spider)
        return Failure(IgnoreRequest(str(exc)))

    def _schedule_next(outcome):
        self.next_request(spider)
        return outcome

    if spider not in self.downloader.sites:
        rejected = defer.fail(Failure(IgnoreRequest()))
        return rejected.addBoth(_schedule_next)

    dwld = mustbe_deferred(self.downloader.fetch, request, spider)
    dwld.addCallbacks(_handle_result, _handle_error)
    dwld.addBoth(_schedule_next)
    return dwld
def _process_request(self, request, info):
    """Serve ``request`` from cache, join a running download, or start one.

    The callback and errback are stripped from ``request`` and attached to
    the deferred handed back to the caller instead.
    """
    key = request_fingerprint(request)
    callback = request.callback or (lambda _: _)
    errback = request.errback
    request.callback = request.errback = None

    # Return cached result if request was already seen
    if key in info.downloaded:
        replay = defer_result(info.downloaded[key])
        return replay.addCallbacks(callback, errback)

    # Otherwise, wait for result
    pending = Deferred().addCallbacks(callback, errback)
    info.waiting[key].append(pending)

    # Check if request is downloading right now to avoid doing it twice
    if key in info.downloading:
        return pending

    # Download request checking media_to_download hook output first
    info.downloading.add(key)
    chain = mustbe_deferred(self.media_to_download, request, info)
    chain.addCallback(self._check_media_to_download, request, info)
    chain.addBoth(self._cache_result_and_execute_waiters, key, info)
    chain.addErrback(log.err, spider=info.spider)
    chain.addBoth(lambda _: pending)  # it must return the waiter at last
    return chain
def _download(self, slot, request, spider):
    """Kick off the download of ``request`` and account for it in ``slot``.

    Returns the download deferred.  The order of the steps below is very
    important -- do not change it.
    """
    # 1. Create the download deferred by calling the handlers'
    #    download_request, which starts the actual download.
    #    NOTE(review): the original annotation points at
    #    scrapy/core/downloader/handlers/__init__.py and says a download
    #    queue with configurable delay is maintained there -- not visible
    #    from this function, confirm before relying on it.
    download_dfd = mustbe_deferred(
        self.handlers.download_request, request, spider)

    # 2. Notify response_downloaded listeners about the recent download
    #    before querying queue for next request
    def _signal_downloaded(response):
        self.signals.send_catch_log(
            signal=signals.response_downloaded,
            response=response,
            request=request,
            spider=spider,
        )
        return response

    download_dfd.addCallback(_signal_downloaded)

    # 3. After response arrives, remove the request from transferring
    #    state to free up the transferring slot so it can be used by the
    #    following requests (perhaps those which came from the downloader
    #    middleware itself)
    slot.transferring.add(request)

    def _finish(outcome):
        # Runs once the response (or failure) is back: drop the request
        # from the transferring set, then process the queue again.
        slot.transferring.remove(request)
        self._process_queue(spider, slot)
        return outcome

    download_dfd.addBoth(_finish)
    return download_dfd
def download(self, download_func, request, spider):
    """Run ``request`` through the downloader middleware chain.

    ``process_request`` hooks run first; when none of them yields a truthy
    result, ``download_func`` is called.  Failures are offered to the
    ``process_exception`` hooks and the outcome is finally passed through the
    ``process_response`` hooks.

    NOTE(review): the original annotation states that ``download_func`` is the
    downloader's ``_enqueue_request`` -- confirm against the caller.
    """

    @defer.inlineCallbacks
    def process_request(request):
        for method in self.methods['process_request']:
            # deferred_from_coro presumably adapts a coroutine returned by
            # the middleware into a Deferred that can be yielded here --
            # TODO confirm.
            response = yield deferred_from_coro(
                method(request=request, spider=spider))
            if response is not None and not isinstance(
                    response, (Response, Request)):
                raise _InvalidOutput(
                    f"Middleware {method.__self__.__class__.__name__}"
                    ".process_request must return None, Response or "
                    f"Request, got {response.__class__.__name__}")
            if response:
                return response
        downloaded = yield download_func(request=request, spider=spider)
        return downloaded

    @defer.inlineCallbacks
    def process_response(response):
        if response is None:
            raise TypeError("Received None in process_response")
        if isinstance(response, Request):
            return response

        for method in self.methods['process_response']:
            response = yield deferred_from_coro(
                method(request=request, response=response, spider=spider))
            if not isinstance(response, (Response, Request)):
                raise _InvalidOutput(
                    f"Middleware {method.__self__.__class__.__name__}"
                    ".process_response must return Response or Request, "
                    f"got {type(response)}")
            if isinstance(response, Request):
                # A new request short-circuits the remaining hooks.
                break
        return response

    @defer.inlineCallbacks
    def process_exception(failure):
        exception = failure.value
        for method in self.methods['process_exception']:
            response = yield deferred_from_coro(
                method(request=request, exception=exception, spider=spider))
            if response is not None and not isinstance(
                    response, (Response, Request)):
                raise _InvalidOutput(
                    f"Middleware {method.__self__.__class__.__name__}"
                    ".process_exception must return None, Response or "
                    f"Request, got {type(response)}")
            if response:
                return response
        return failure

    # NOTE(review): per the original annotation mustbe_deferred is similar to
    # maybeDeferred but does not run callbacks/errbacks in the current
    # reactor iteration -- confirm against its definition.
    chain = mustbe_deferred(process_request, request)
    chain.addErrback(process_exception)
    chain.addCallback(process_response)
    return chain
def _crawl(self, request):
    """Download ``request`` and return a deferred firing with a result dict.

    The dict always has ``status``, ``doc``, ``headers``, ``url`` and
    ``meta`` keys; ``error_message`` is added when the download did not
    produce a Response, and ``redirect_time`` when a redirect is reported.
    """
    dfd = mustbe_deferred(self._handler.download_request, request, None)

    def downloaded(response, request):
        # Attached with addBoth, so ``response`` may also be a failure
        # rather than a Response.
        # Defaults; 600 stays in place for the failure branch below.
        result = {"status" : 600, "doc" : None, "headers" : None}
        if isinstance(response, Response):
            # The redirect middleware may hand back the Response itself or a
            # new Request.
            response = self._redirect_middleware.process_response(request, response, None)
            if isinstance(response, Response):
                result["url"] = response.url
                result["status"] = response.status
                result["doc"] = response.body
                result["headers"] = response.headers
                result["meta"] = request.meta
            elif isinstance(response, Request):
                # Count this redirect on top of what the new request's meta
                # already records.
                redirect_time = response.meta.get("redirect_time", 0)
                redirect_time += 1
                if redirect_time >= 2:
                    # NOTE(review): 601 presumably means "redirect limit
                    # reached" -- confirm with the consumer of this dict.
                    result["url"] = response.url
                    result["status"] = 601
                    result["meta"] = response.meta
                else:
                    # NOTE(review): 700 presumably means "follow this
                    # redirect" -- confirm with the consumer of this dict.
                    result["url"] = response.url
                    result["status"] = 700
                    result["meta"] = response.meta
                    result["redirect_time"] = redirect_time
            else:
                raise Exception("not supported")
        else:
            # Not a Response: report the original URL and the stringified
            # outcome.
            result["url"] = request.url
            result["error_message"] = "crawler failed:%s" % response
            result["meta"] = request.meta
        return result

    return dfd.addBoth(downloaded, request)
def _download(self, request, info, fp):
    """Start the media download for ``request`` and record its state in ``info``.

    Nothing is returned; results are delivered to the deferreds stored under
    ``info.waiting[fp]``.
    """

    def _deliver(result):
        # Move the fingerprint from "downloading" to "downloaded" and pass
        # the result to each waiting client.
        info.downloading.pop(fp)
        info.downloaded[fp] = result
        waiters = info.waiting.pop(fp)
        for waiter in waiters:
            defer_result(result).chainDeferred(waiter)

    def _after_media_to_download(result):
        if result is not None:
            # use media_to_download return value as result
            dwld = defer_result(result)
        else:
            # continue with download
            hook_args = (request, info)
            dwld = mustbe_deferred(self.download, request, info)
            dwld.addCallbacks(
                callback=self.media_downloaded,
                callbackArgs=hook_args,
                errback=self.media_failed,
                errbackArgs=hook_args)

        info.downloading[fp] = (request, dwld)  # fill downloading state data
        dwld.addBoth(_deliver)  # append post-download hook
        dwld.addErrback(log.err, spider=info.spider)

    # declare request in downloading state (None is used as place holder)
    info.downloading[fp] = None

    # defer pre-download request processing
    mustbe_deferred(self.media_to_download, request, info).addCallback(
        _after_media_to_download)
def download(self, download_func: Callable, request: Request, spider: Spider):
    """Drive ``request`` through the downloader middleware hooks.

    Order of processing: every ``process_request`` hook (falling back to
    ``download_func`` when none returns a truthy value), then the
    ``process_exception`` hooks on failure, then the ``process_response``
    hooks.  Returns the resulting deferred.
    """

    @defer.inlineCallbacks
    def process_request(request: Request):
        for method in self.methods['process_request']:
            method = cast(Callable, method)
            response = yield deferred_from_coro(
                method(request=request, spider=spider))
            if response is not None and not isinstance(
                    response, (Response, Request)):
                raise _InvalidOutput(
                    f"Middleware {method.__qualname__} must return None, Response or "
                    f"Request, got {response.__class__.__name__}")
            if response:
                return response
        downloaded = yield download_func(request=request, spider=spider)
        return downloaded

    @defer.inlineCallbacks
    def process_response(response: Union[Response, Request]):
        if response is None:
            raise TypeError("Received None in process_response")
        if isinstance(response, Request):
            return response

        for method in self.methods['process_response']:
            method = cast(Callable, method)
            response = yield deferred_from_coro(
                method(request=request, response=response, spider=spider))
            if not isinstance(response, (Response, Request)):
                raise _InvalidOutput(
                    f"Middleware {method.__qualname__} must return Response or Request, "
                    f"got {type(response)}")
            if isinstance(response, Request):
                # A replacement request skips the remaining hooks.
                break
        return response

    @defer.inlineCallbacks
    def process_exception(failure: Failure):
        exception = failure.value
        for method in self.methods['process_exception']:
            method = cast(Callable, method)
            response = yield deferred_from_coro(
                method(request=request, exception=exception, spider=spider))
            if response is not None and not isinstance(
                    response, (Response, Request)):
                raise _InvalidOutput(
                    f"Middleware {method.__qualname__} must return None, Response or "
                    f"Request, got {type(response)}")
            if response:
                return response
        # No hook handled the exception: keep propagating the failure.
        return failure

    chain = mustbe_deferred(process_request, request)
    chain.addErrback(process_exception)
    chain.addCallback(process_response)
    return chain
def _next_request(self, spider):
    """Take the next scheduled request for ``spider`` and start downloading it.

    Returns the download deferred, or None when the scheduler yields a falsy
    request.
    """
    # Next pending request from scheduler
    request, deferred = self.scheduler.next_request(spider)
    if not request:
        return None

    dwld = mustbe_deferred(self.download, request, spider)
    # Feed the outcome into the scheduler's deferred, then continue with
    # that deferred as the result.
    linked = dwld.chainDeferred(deferred)
    linked.addBoth(lambda _: deferred)
    dwld.addErrback(
        log.err, "Unhandled error on engine._next_request()", spider=spider)
    return dwld
def next_stage(item, stages_left):
    """Feed ``item`` to the first remaining stage, recursing for the rest.

    ``stages_left`` is consumed in place.  Returns ``item`` itself once no
    stages remain, otherwise a deferred for the rest of the chain.
    """
    assert isinstance(item, BaseItem), \
        'Item pipelines must return a BaseItem, got %s' % type(item).__name__
    if stages_left:
        stage = stages_left.pop(0)
        return mustbe_deferred(stage.process_item, spider, item).addCallback(
            next_stage, stages_left)
    return item
def download(self, download_func, request, spider):
    """Pass ``request`` through the downloader middleware chain.

    ``process_request`` hooks run first, then ``download_func`` when none of
    them returned a truthy value; ``process_exception`` hooks handle
    failures and ``process_response`` hooks post-process the outcome.
    """

    @defer.inlineCallbacks
    def process_request(request):
        # Run each downloader middleware's process_request in turn; this is
        # where request and spider are handed to the hook.
        for mw_method in self.methods['process_request']:
            response = yield mw_method(request=request, spider=spider)
            if response is not None and not isinstance(
                    response, (Response, Request)):
                raise _InvalidOutput(
                    'Middleware %s.process_request must return None, Response or Request, got %s'
                    % (six.get_method_self(mw_method).__class__.__name__,
                       response.__class__.__name__))
            if response:
                defer.returnValue(response)
        # No middleware produced a value: call the function passed in.
        # NOTE(review): the original annotation says this is the
        # Downloader's _enqueue_request -- confirm against the caller.
        downloaded = yield download_func(request=request, spider=spider)
        defer.returnValue(downloaded)

    @defer.inlineCallbacks
    def process_response(response):
        assert response is not None, 'Received None in process_response'
        if isinstance(response, Request):
            defer.returnValue(response)

        for mw_method in self.methods['process_response']:
            response = yield mw_method(
                request=request, response=response, spider=spider)
            if not isinstance(response, (Response, Request)):
                raise _InvalidOutput(
                    'Middleware %s.process_response must return Response or Request, got %s'
                    % (six.get_method_self(mw_method).__class__.__name__,
                       type(response)))
            if isinstance(response, Request):
                break
        defer.returnValue(response)

    @defer.inlineCallbacks
    def process_exception(_failure):
        exc = _failure.value
        for mw_method in self.methods['process_exception']:
            response = yield mw_method(
                request=request, exception=exc, spider=spider)
            if response is not None and not isinstance(
                    response, (Response, Request)):
                raise _InvalidOutput(
                    'Middleware %s.process_exception must return None, Response or Request, got %s'
                    % (six.get_method_self(mw_method).__class__.__name__,
                       type(response)))
            if response:
                defer.returnValue(response)
        defer.returnValue(_failure)

    # process_request is invoked here: the middleware hooks are run and the
    # request is then sent out for download.
    chain = mustbe_deferred(process_request, request)
    chain.addErrback(process_exception)
    chain.addCallback(process_response)
    return chain
def _on_success(response):
    """Handle the result of a page download.

    A Response is tied to the originating request, logged and returned; a
    Request is passed to ``self.schedule`` via ``mustbe_deferred``.
    """
    assert isinstance(response, (Response, Request))
    if isinstance(response, Response):
        response.request = request  # tie request to response received
        crawled_line = log.formatter.crawled(request, response, spider)
        log.msg(crawled_line, level=log.DEBUG, spider=spider)
        return response
    if isinstance(response, Request):
        return mustbe_deferred(self.schedule, response, spider)
def test_success_function(self):
    """The callback must observe a value appended after ``mustbe_deferred``."""
    recorded = []

    def _record(value):
        recorded.append(value)
        return recorded

    dfd = mustbe_deferred(_record, 1)
    # With maybeDeferred the callback would see [1] only.
    dfd.addCallback(self.assertEqual, [1, 2])
    # Appended after the call; assertEqual is expected to see it as well.
    recorded.append(2)
    return dfd
def download(self, download_func, request, spider):
    """Run ``request`` through the downloader middleware hooks.

    ``process_request`` hooks come first (with ``download_func`` as the
    fallback), ``process_exception`` hooks handle failures, and
    ``process_response`` hooks see the final outcome.
    """

    @defer.inlineCallbacks
    def process_request(request):
        for mw_method in self.methods['process_request']:
            response = yield deferred_from_coro(
                mw_method(request=request, spider=spider))
            if response is not None and not isinstance(
                    response, (Response, Request)):
                raise _InvalidOutput(
                    "Middleware %s.process_request must return None, Response or Request, got %s"
                    % (mw_method.__self__.__class__.__name__,
                       response.__class__.__name__))
            if response:
                defer.returnValue(response)
        downloaded = yield download_func(request=request, spider=spider)
        defer.returnValue(downloaded)

    @defer.inlineCallbacks
    def process_response(response):
        assert response is not None, 'Received None in process_response'
        if isinstance(response, Request):
            defer.returnValue(response)

        for mw_method in self.methods['process_response']:
            response = yield deferred_from_coro(
                mw_method(request=request, response=response, spider=spider))
            if not isinstance(response, (Response, Request)):
                raise _InvalidOutput(
                    "Middleware %s.process_response must return Response or Request, got %s"
                    % (mw_method.__self__.__class__.__name__, type(response)))
            if isinstance(response, Request):
                # A new request ends the response chain early.
                break
        defer.returnValue(response)

    @defer.inlineCallbacks
    def process_exception(_failure):
        exc = _failure.value
        for mw_method in self.methods['process_exception']:
            response = yield deferred_from_coro(
                mw_method(request=request, exception=exc, spider=spider))
            if response is not None and not isinstance(
                    response, (Response, Request)):
                raise _InvalidOutput(
                    "Middleware %s.process_exception must return None, Response or Request, got %s"
                    % (mw_method.__self__.__class__.__name__, type(response)))
            if response:
                defer.returnValue(response)
        defer.returnValue(_failure)

    chain = mustbe_deferred(process_request, request)
    chain.addErrback(process_exception)
    chain.addCallback(process_response)
    return chain
def scrape_response(self, scrape_func, response, request, spider):
    """Run the spider-input step and wire up output/exception handling.

    ``self._process_spider_input`` is invoked through ``mustbe_deferred``;
    its result goes to ``self._process_callback_output`` and failures go to
    ``self._process_spider_exception``.
    """

    def _on_output(result):
        return self._process_callback_output(response, spider, result)

    def _on_exception(_failure):
        return self._process_spider_exception(response, spider, _failure)

    return mustbe_deferred(
        self._process_spider_input, scrape_func, response, request, spider,
    ).addCallbacks(callback=_on_output, errback=_on_exception)
def test_unfired_deferred(self):
    """A not-yet-fired deferred returned by the function is passed along."""
    recorded = []

    def _record_later(value):
        recorded.append(value)
        pending = defer.Deferred()
        # Fire on the reactor's next turn with the shared list.
        reactor.callLater(0, pending.callback, recorded)
        return pending

    dfd = mustbe_deferred(_record_later, 1)
    # With maybeDeferred the callback would see [1] only.
    dfd.addCallback(self.assertEqual, [1, 2])
    # Appended after the call; assertEqual is expected to see it as well.
    recorded.append(2)
    return dfd
def download(self, download_func, request, spider):
    """Run ``request`` through the downloader middleware chain.

    ``process_request`` hooks run first, falling back to ``download_func``
    when none returns a truthy value.  When ``download_func`` fails, the
    request URL is appended to the ``eroor`` file and the error is re-raised
    so that the ``process_exception`` hooks (and the caller's errback) see
    the real failure.  ``process_response`` hooks post-process the outcome.

    Returns the resulting deferred.
    """

    @defer.inlineCallbacks
    def process_request(request):
        for method in self.methods['process_request']:
            response = yield method(request=request, spider=spider)
            assert response is None or isinstance(response, (Response, Request)), \
                'Middleware %s.process_request must return None, Response or Request, got %s' % \
                (six.get_method_self(method).__class__.__name__, response.__class__.__name__)
            if response:
                defer.returnValue(response)
        try:
            defer.returnValue((yield download_func(request=request, spider=spider)))
        except Exception:
            # Keep a record of the URL that failed to download ...
            with open('eroor', 'a') as w:
                w.write(request.url + '\n')
            # ... but do not swallow the error: returning None from here
            # would only surface later as an unrelated "Received None in
            # process_response" assertion and bypass process_exception.
            raise

    @defer.inlineCallbacks
    def process_response(response):
        assert response is not None, 'Received None in process_response'
        if isinstance(response, Request):
            defer.returnValue(response)

        for method in self.methods['process_response']:
            response = yield method(request=request, response=response, spider=spider)
            assert isinstance(response, (Response, Request)), \
                'Middleware %s.process_response must return Response or Request, got %s' % \
                (six.get_method_self(method).__class__.__name__, type(response))
            if isinstance(response, Request):
                defer.returnValue(response)
        defer.returnValue(response)

    @defer.inlineCallbacks
    def process_exception(_failure):
        exception = _failure.value
        for method in self.methods['process_exception']:
            response = yield method(request=request, exception=exception, spider=spider)
            assert response is None or isinstance(response, (Response, Request)), \
                'Middleware %s.process_exception must return None, Response or Request, got %s' % \
                (six.get_method_self(method).__class__.__name__, type(response))
            if response:
                defer.returnValue(response)
        # No hook handled the exception: keep propagating the failure.
        defer.returnValue(_failure)

    deferred = mustbe_deferred(process_request, request)
    deferred.addErrback(process_exception)
    deferred.addCallback(process_response)
    return deferred
def crawl(self, request, spider):
    """Schedule ``request`` for ``spider`` and queue its result for scraping.

    Requests for spiders that are being closed are ignored.  Nothing is
    returned.
    """
    assert spider in self.open_spiders, \
        "Spider %r not opened when crawling: %s" % (spider.name, request)
    if spider in self.closing:
        # ignore requests for spiders being closed
        return

    schd = mustbe_deferred(self.schedule, request, spider)
    # FIXME: we can't log errors here because we would be preventing them
    # from propagating to the request errback. This should be fixed after
    # the next core refactoring.
    schd.addBoth(self.scraper.enqueue_scrape, request, spider)
    schd.addErrback(
        log.err, "Unhandled error on engine.crawl()", spider=spider)
    schd.addBoth(lambda _: self.next_request(spider))
def _on_success(response):
    """Handle the result of a page download.

    A Response is tied to the originating request, logged and returned.  A
    Request is scheduled and the scheduling outcome is chained into that
    request's own deferred, which is returned.
    """
    assert isinstance(response, (Response, Request))
    if isinstance(response, Response):
        response.request = request  # tie request to response received
        logline = self._crawled_logline(request, response)
        log.msg(logline, level=log.DEBUG, spider=spider)
        return response
    if isinstance(response, Request):
        newrequest = response
        mustbe_deferred(self.schedule, newrequest, spider).chainDeferred(
            newrequest.deferred)
        return newrequest.deferred
def crawl(self, request, spider):
    """Schedule ``request`` and queue its result for scraping.

    A request whose deferred has no callbacks is logged as an error and
    dropped.  Nothing is returned.
    """
    has_callbacks = bool(request.deferred.callbacks)
    if not has_callbacks:
        message = "Unable to crawl Request with no callback: %s" % request
        log.msg(message, level=log.ERROR, spider=spider)
        return

    schd = mustbe_deferred(self.schedule, request, spider)
    # FIXME: we can't log errors here because we would be preventing them
    # from propagating to the request errback. This should be fixed after
    # the next core refactoring.
    schd.addBoth(self.scraper.enqueue_scrape, request, spider)
    schd.addErrback(
        log.err, "Unhandled error on engine.crawl()", spider=spider)
    schd.addBoth(lambda _: self.next_request(spider))
def enqueue_request(self, wrappedfunc, spider, request):
    """Run the ``enqueue_request`` middleware hooks around ``wrappedfunc``.

    The first hook returning a truthy value wins; when none does,
    ``wrappedfunc`` is called.  The outcome is always delivered as a deferred.
    """

    def _run_hooks(request):
        for hook in self.methods['enqueue_request']:
            outcome = hook(spider=spider, request=request)
            assert outcome is None or isinstance(outcome, Deferred), \
                'Middleware %s.enqueue_request must return None or Deferred, got %s' % \
                (hook.im_self.__class__.__name__, outcome.__class__.__name__)
            if outcome:
                return outcome
        return wrappedfunc(spider=spider, request=request)

    return mustbe_deferred(_run_hooks, request)
def scrape_response(self, scrape_func, response, request, spider):
    """Run ``response`` through the spider middleware chain.

    The ``process_spider_input`` hooks run first, followed by
    ``scrape_func``.  Failures are offered to the
    ``process_spider_exception`` hooks and the result is finally passed
    through the ``process_spider_output`` hooks.

    Returns the resulting deferred.
    """

    def fname(f):
        # "ClassName.method_name" of a bound middleware method, used in the
        # assertion messages below.
        return '%s.%s' % (six.get_method_self(f).__class__.__name__,
                          six.get_method_function(f).__name__)

    def process_spider_input(response):
        # Run every process_spider_input hook, then scrape_func.
        for method in self.methods['process_spider_input']:
            try:
                result = method(response=response, spider=spider)
                assert result is None, \
                    'Middleware %s must returns None or ' \
                    'raise an exception, got %s ' \
                    % (fname(method), type(result))
            except Exception:
                # A hook failure (or a bad return value) is handed to
                # scrape_func as a Failure.  Only Exception is caught so
                # that KeyboardInterrupt/SystemExit are not turned into
                # spider failures.
                return scrape_func(Failure(), request, spider)
        return scrape_func(response, request, spider)

    def process_spider_exception(_failure):
        # Offer the exception to every process_spider_exception hook; the
        # first non-None result replaces the failure.
        exception = _failure.value
        for method in self.methods['process_spider_exception']:
            result = method(response=response, exception=exception, spider=spider)
            assert result is None or _isiterable(result), \
                'Middleware %s must returns None, or an iterable object, got %s ' % \
                (fname(method), type(result))
            if result is not None:
                return result
        return _failure

    def process_spider_output(result):
        # Pass the result through every process_spider_output hook in turn.
        for method in self.methods['process_spider_output']:
            result = method(response=response, result=result, spider=spider)
            assert _isiterable(result), \
                'Middleware %s must returns an iterable object, got %s ' % \
                (fname(method), type(result))
        return result

    # Run the input stage, then register the exception and output stages.
    dfd = mustbe_deferred(process_spider_input, response)
    dfd.addErrback(process_spider_exception)
    dfd.addCallback(process_spider_output)
    return dfd
def _post_media_to_download(result):
    """Continue after the ``media_to_download`` hook has produced ``result``.

    A None result triggers the real download; anything else is used as the
    result directly.  Either way the deferred is recorded in
    ``info.downloading`` and the post-download hook is attached.
    """
    if result is not None:
        # use media_to_download return value as result
        dwld = defer_result(result)
    else:
        # continue with download
        hook_args = (request, info)
        dwld = mustbe_deferred(self.download, request, info)
        dwld.addCallbacks(
            callback=self.media_downloaded,
            callbackArgs=hook_args,
            errback=self.media_failed,
            errbackArgs=hook_args)

    info.downloading[fp] = (request, dwld)  # fill downloading state data
    dwld.addBoth(_downloaded)  # append post-download hook
    dwld.addErrback(log.err, spider=info.spider)
def scrape_response(self, scrape_func, response, request, spider):
    """Run ``response`` through the spider middleware chain.

    Flow: ``process_spider_input`` hooks -> ``scrape_func`` ->
    ``process_spider_output`` hooks, with the ``process_spider_exception``
    hooks consulted when a failure occurs.

    Returns the resulting deferred.
    """

    def fname(f):
        # "ClassName.method_name" of a bound middleware method, used in the
        # assertion messages below.
        return '%s.%s' % (six.get_method_self(f).__class__.__name__,
                          six.get_method_function(f).__name__)

    def process_spider_input(response):
        for method in self.methods['process_spider_input']:
            try:
                result = method(response=response, spider=spider)
                assert result is None, \
                    'Middleware %s must returns None or ' \
                    'raise an exception, got %s ' \
                    % (fname(method), type(result))
            except Exception:
                # A hook failure (or a bad return value) is handed to
                # scrape_func as a Failure.  Only Exception is caught so
                # that KeyboardInterrupt/SystemExit are not turned into
                # spider failures.
                return scrape_func(Failure(), request, spider)
        # NOTE(review): per the original annotation scrape_func is
        # call_spider and returns a deferred carrying the next callbacks --
        # confirm against the caller.
        return scrape_func(response, request, spider)

    def process_spider_exception(_failure):
        # Only reached when an earlier step failed.
        exception = _failure.value
        for method in self.methods['process_spider_exception']:
            result = method(response=response, exception=exception, spider=spider)
            assert result is None or _isiterable(result), \
                'Middleware %s must returns None, or an iterable object, got %s ' % \
                (fname(method), type(result))
            if result is not None:
                return result
        return _failure

    def process_spider_output(result):
        for method in self.methods['process_spider_output']:
            result = method(response=response, result=result, spider=spider)
            assert _isiterable(result), \
                'Middleware %s must returns an iterable object, got %s ' % \
                (fname(method), type(result))
        return result

    # Run the input stage on the incoming response, then register the
    # exception and output stages.
    dfd = mustbe_deferred(process_spider_input, response)
    dfd.addErrback(process_spider_exception)
    dfd.addCallback(process_spider_output)
    return dfd
def scrape_response(self, scrape_func: ScrapeFunc, response: Response,
                    request: Request, spider: Spider) -> Deferred:
    """Run the spider-input step and wire up output/exception handling.

    ``self._process_spider_input`` is invoked through ``mustbe_deferred``;
    its result goes to ``self._process_callback_output`` and failures go to
    ``self._process_spider_exception``.
    """

    def _on_output(result: Iterable) -> MutableChain:
        return self._process_callback_output(response, spider, result)

    def _on_exception(_failure: Failure) -> Union[Failure, MutableChain]:
        return self._process_spider_exception(response, spider, _failure)

    return mustbe_deferred(
        self._process_spider_input, scrape_func, response, request, spider,
    ).addCallbacks(callback=_on_output, errback=_on_exception)
def _on_success(response):
    """Handle the result of a page download.

    A Response is tied to the originating request, logged and returned.  A
    Request is scheduled; when it carries a callback, that callback and its
    errback are attached to the scheduling deferred, which is returned.
    """
    assert isinstance(response, (Response, Request))
    if isinstance(response, Response):
        response.request = request  # tie request to response received
        logline = self._crawled_logline(request, response)
        log.msg(logline, level=log.DEBUG, spider=spider)
        return response
    if isinstance(response, Request):
        newrequest = response
        scheduled = mustbe_deferred(self.schedule, newrequest, spider)
        if newrequest.callback:
            # XXX: this is a bit hacky and should be removed
            scheduled.addCallbacks(newrequest.callback, newrequest.errback)
        return scheduled
def crawl(self, request, spider):
    """Schedule ``request`` for ``spider`` and queue its result for scraping.

    Requests for spiders that are being closed are ignored before the
    open-spider assertion is evaluated.  Nothing is returned.
    """
    if spider in self.closing:
        # ignore requests for spiders being closed
        return
    assert spider in self.open_spiders, \
        "Spider %r not opened when crawling: %s" % (spider.name, request)

    schd = mustbe_deferred(self.schedule, request, spider)
    # FIXME: we can't log errors here because we would be preventing them
    # from propagating to the request errback. This should be fixed after
    # the next core refactoring.
    schd.addBoth(self.scraper.enqueue_scrape, request, spider)
    schd.addErrback(
        log.err, "Unhandled error on engine.crawl()", spider=spider)
    schd.addBoth(lambda _: self.next_request(spider))
def download(self, download_func, request, spider):
    """Run ``request`` through the downloader middleware hooks.

    ``process_request`` hooks are skipped as soon as the request's meta
    carries a truthy ``_splash_processed`` entry; ``download_func`` is the
    fallback when no hook returned a truthy value.  ``process_exception``
    hooks handle failures and ``process_response`` hooks see the outcome.
    """

    @defer.inlineCallbacks
    def process_request(request):
        for mw_method in self.methods['process_request']:
            # splash request: do not process it again
            if request.meta.get("_splash_processed"):
                break
            response = yield mw_method(request=request, spider=spider)
            assert response is None or isinstance(response, (Response, Request)), (
                'Middleware %s.process_request must return None, Response or Request, got %s'
                % (six.get_method_self(mw_method).__class__.__name__,
                   response.__class__.__name__))
            if response:
                defer.returnValue(response)
        downloaded = yield download_func(request=request, spider=spider)
        defer.returnValue(downloaded)

    @defer.inlineCallbacks
    def process_response(response):
        assert response is not None, 'Received None in process_response'
        if isinstance(response, Request):
            defer.returnValue(response)

        for mw_method in self.methods['process_response']:
            response = yield mw_method(
                request=request, response=response, spider=spider)
            assert isinstance(response, (Response, Request)), (
                'Middleware %s.process_response must return Response or Request, got %s'
                % (six.get_method_self(mw_method).__class__.__name__,
                   type(response)))
            if isinstance(response, Request):
                break
        defer.returnValue(response)

    @defer.inlineCallbacks
    def process_exception(_failure):
        exc = _failure.value
        for mw_method in self.methods['process_exception']:
            response = yield mw_method(
                request=request, exception=exc, spider=spider)
            assert response is None or isinstance(response, (Response, Request)), (
                'Middleware %s.process_exception must return None, Response or Request, got %s'
                % (six.get_method_self(mw_method).__class__.__name__,
                   type(response)))
            if response:
                defer.returnValue(response)
        defer.returnValue(_failure)

    chain = mustbe_deferred(process_request, request)
    chain.addErrback(process_exception)
    chain.addCallback(process_response)
    return chain
def _check_media_to_download(self, result, request, info):
    """Pass through a pre-download result, or start the real download.

    A non-None ``result`` is returned untouched.  Otherwise the request is
    downloaded -- through ``self.download_func`` when set, else through the
    crawler engine -- and the outcome is routed to ``media_downloaded`` /
    ``media_failed``.
    """
    if result is not None:
        return result

    if self.download_func:
        # this ugly code was left only to support tests. TODO: remove
        dfd = mustbe_deferred(self.download_func, request, info.spider)
    else:
        request.meta['handle_httpstatus_all'] = True
        dfd = self.crawler.engine.download(request, info.spider)

    # Both paths share the same post-download hooks.
    dfd.addCallbacks(
        callback=self.media_downloaded, callbackArgs=(request, info),
        errback=self.media_failed, errbackArgs=(request, info))
    return dfd
def _check_media_to_download(self, result, request, info):
    """Pass through a pre-download result, or start the real download.

    A non-None ``result`` is returned untouched.  Otherwise the request is
    downloaded -- through ``self.download_func`` when set, else through the
    crawler engine after ``self._modify_media_request`` has been applied --
    and the outcome is routed to ``media_downloaded`` / ``media_failed``.
    """
    if result is not None:
        return result

    if self.download_func:
        # this ugly code was left only to support tests. TODO: remove
        dfd = mustbe_deferred(self.download_func, request, info.spider)
    else:
        self._modify_media_request(request)
        dfd = self.crawler.engine.download(request, info.spider)

    # Both paths share the same post-download hooks.
    dfd.addCallbacks(
        callback=self.media_downloaded, callbackArgs=(request, info),
        errback=self.media_failed, errbackArgs=(request, info))
    return dfd
def _download(self, site, request, spider):
    """Start the download of ``request`` and track it in ``site.transferring``.

    Returns the download deferred.  The order of the steps below is very
    important -- do not change it.
    """
    # 1. Create the download deferred
    download_dfd = mustbe_deferred(
        self.handlers.download_request, request, spider)

    # 2. After response arrives, remove the request from transferring
    #    state to free up the transferring slot so it can be used by the
    #    following requests (perhaps those which came from the downloader
    #    middleware itself)
    site.transferring.add(request)

    def _release(outcome):
        site.transferring.remove(request)
        self._process_queue(spider)
        return outcome  # hand the result/failure on unchanged

    download_dfd.addBoth(_release)
    return download_dfd
def download(self, download_func, request, spider):
    """Run ``request`` through the downloader middleware hooks.

    ``process_request`` hooks come first (with ``download_func`` as the
    fallback), ``process_exception`` hooks handle failures, and
    ``process_response`` hooks see the final outcome.
    """

    @defer.inlineCallbacks
    def process_request(request):
        for mw_method in self.methods["process_request"]:
            response = yield mw_method(request=request, spider=spider)
            assert response is None or isinstance(response, (Response, Request)), \
                "Middleware %s.process_request must return None, Response or Request, got %s" % \
                (six.get_method_self(mw_method).__class__.__name__,
                 response.__class__.__name__)
            if response:
                defer.returnValue(response)
        downloaded = yield download_func(request=request, spider=spider)
        defer.returnValue(downloaded)

    @defer.inlineCallbacks
    def process_response(response):
        assert response is not None, "Received None in process_response"
        if isinstance(response, Request):
            defer.returnValue(response)

        for mw_method in self.methods["process_response"]:
            response = yield mw_method(
                request=request, response=response, spider=spider)
            assert isinstance(response, (Response, Request)), \
                "Middleware %s.process_response must return Response or Request, got %s" % \
                (six.get_method_self(mw_method).__class__.__name__,
                 type(response))
            if isinstance(response, Request):
                # A new request ends the response chain early.
                break
        defer.returnValue(response)

    @defer.inlineCallbacks
    def process_exception(_failure):
        exc = _failure.value
        for mw_method in self.methods["process_exception"]:
            response = yield mw_method(
                request=request, exception=exc, spider=spider)
            assert response is None or isinstance(response, (Response, Request)), \
                "Middleware %s.process_exception must return None, Response or Request, got %s" % \
                (six.get_method_self(mw_method).__class__.__name__,
                 type(response))
            if response:
                defer.returnValue(response)
        defer.returnValue(_failure)

    chain = mustbe_deferred(process_request, request)
    chain.addErrback(process_exception)
    chain.addCallback(process_response)
    return chain
def download(self, download_func, request, spider):
    """Pass ``request`` through the downloader middleware methods.

    Order of execution: ``process_request`` first, then
    ``process_exception`` (attached as errback), and finally
    ``process_response`` (attached as callback). Returns the deferred.
    """

    def _mw_class_name(bound_method):
        # Only used to build assertion messages.
        return six.get_method_self(bound_method).__class__.__name__

    @defer.inlineCallbacks
    def process_request(request):
        for mw_method in self.methods['process_request']:
            produced = yield mw_method(request=request, spider=spider)
            # The return type is restricted to None, Response or Request.
            assert produced is None or isinstance(produced, (Response, Request)), \
                'Middleware %s.process_request must return None, Response or Request, got %s' % \
                (_mw_class_name(mw_method), produced.__class__.__name__)
            if produced:
                # While the value is empty the loop keeps going through the
                # middlewares; as soon as one is not, we bail out early.
                defer.returnValue(produced)
        fetched = yield download_func(request=request, spider=spider)
        defer.returnValue(fetched)

    @defer.inlineCallbacks
    def process_response(response):
        assert response is not None, 'Received None in process_response'
        if isinstance(response, Request):
            defer.returnValue(response)

        latest = response
        for mw_method in self.methods['process_response']:
            latest = yield mw_method(request=request, response=latest, spider=spider)
            assert isinstance(latest, (Response, Request)), \
                'Middleware %s.process_response must return Response or Request, got %s' % \
                (_mw_class_name(mw_method), type(latest))
            if isinstance(latest, Request):
                break
        defer.returnValue(latest)

    @defer.inlineCallbacks
    def process_exception(_failure):
        raised = _failure.value
        for mw_method in self.methods['process_exception']:
            produced = yield mw_method(request=request, exception=raised, spider=spider)
            assert produced is None or isinstance(produced, (Response, Request)), \
                'Middleware %s.process_exception must return None, Response or Request, got %s' % \
                (_mw_class_name(mw_method), type(produced))
            if produced:
                defer.returnValue(produced)
        defer.returnValue(_failure)

    # Execution order: request first, then exception, response last.
    pipeline = mustbe_deferred(process_request, request)
    pipeline.addErrback(process_exception)
    pipeline.addCallback(process_response)
    return pipeline
def _download(self, slot, request, spider):
    """Start the download of ``request`` and track it in ``slot.transferring``.

    Returns the deferred obtained from ``mustbe_deferred``; whatever its
    outcome, the request is removed from ``slot.transferring`` and
    ``self._process_queue(spider, slot)`` is called before the outcome is
    passed on unchanged.
    """
    # NOTE: the order of the statements below matters. Do not reorder them.

    # Step 1: build the deferred that wraps the handler call.
    transfer = mustbe_deferred(
        self.handlers.download_request, request, spider)

    # Step 2: register the request as transferring. It is unregistered when
    # the result arrives so the slot can be reused by the following requests
    # (perhaps those that came from the downloader middleware itself).
    slot.transferring.add(request)

    def _on_transfer_done(outcome):
        slot.transferring.remove(request)
        self._process_queue(spider, slot)
        return outcome

    return transfer.addBoth(_on_transfer_done)
def scrape_response(
    self,
    scrape_func: ScrapeFunc,
    response: Response,
    request: Request,
    spider: Spider,
) -> Deferred:
    """Feed ``response`` to ``self._process_spider_input`` and post-process it.

    The result is handed to ``self._process_callback_output`` (a coroutine,
    adapted with ``deferred_f_from_coro_f``); a failure is handed to
    ``self._process_spider_exception``. Returns the resulting deferred.
    """

    async def _handle_output(
        result: Union[Iterable, AsyncIterable]
    ) -> Union[MutableChain, MutableAsyncChain]:
        processed = await self._process_callback_output(response, spider, result)
        return processed

    def _handle_exception(_failure: Failure) -> Union[Failure, MutableChain]:
        return self._process_spider_exception(response, spider, _failure)

    scrape_dfd = mustbe_deferred(
        self._process_spider_input, scrape_func, response, request, spider
    )
    scrape_dfd.addCallbacks(
        callback=deferred_f_from_coro_f(_handle_output),
        errback=_handle_exception,
    )
    return scrape_dfd
def _download(self, slot, request, spider):
    """Download ``request`` while recording the transfer via the DB helpers.

    A start record is written through ``self.write_database_start`` right
    before the per-spider handler is invoked; its return value is stored in
    ``request.meta['the_id_from_scrapy']``. On success the end time and the
    body size are written with ``self.write_database_end``; on failure the
    record is removed with ``self.delete_bad_with_id``. Returns the deferred
    carrying the response (or failure).
    """
    # The order is very important for the following deferreds. Do not change!

    # Placeholder id, overwritten once the start record has been written.
    request.meta['the_id_from_scrapy'] = 0

    # 1. Create the download deferred
    def download_request(request, spider):
        starttime = str(datetime.datetime.now())
        request.meta['the_id_from_scrapy'] = self.write_database_start(
            spider.name, starttime)
        return self.handlers[spider.name].download_request(request, spider)

    dfd = mustbe_deferred(download_request, request, spider)

    # 2. Notify response_downloaded listeners about the recent download
    # before querying queue for next request
    def _downloaded(response):
        # Measure the body itself: str() of a bytes body yields its repr on
        # Python 3 ("b'...'" plus escape sequences), which inflates the size.
        size = len(response.body)
        self.write_database_end(
            request.meta['the_id_from_scrapy'],
            str(datetime.datetime.now()),
            size)
        self.signals[spider.name].send_catch_log(
            signal=signals.response_downloaded,
            response=response,
            request=request,
            spider=spider)
        return response

    dfd.addCallback(_downloaded)

    def _err_handle(failure):
        # Drop the bookkeeping record of a failed download and keep the
        # failure propagating down the chain.
        self.delete_bad_with_id(request.meta['the_id_from_scrapy'])
        return failure

    dfd.addErrback(_err_handle)

    # 3. After response arrives, remove the request from transferring
    # state to free up the transferring slot so it can be used by the
    # following requests (perhaps those which came from the downloader
    # middleware itself)
    slot.transferring.add(request)

    def finish_transferring(_):
        slot.transferring.remove(request)
        self._process_queue(spider, slot)
        return _

    return dfd.addBoth(finish_transferring)
def download(self, download_func, request, spider):
    """Run ``request`` through the downloader middleware methods.

    ``process_request`` methods run first and may short-circuit the download
    by returning a truthy value; otherwise ``download_func`` is called.
    ``process_exception`` is attached as errback and ``process_response`` as
    callback. Returns the resulting deferred.
    """

    def _owner(hook):
        # Class name of the middleware owning a bound method (used only in
        # assertion messages).
        return six.get_method_self(hook).__class__.__name__

    def process_request(request):
        for hook in self.methods['process_request']:
            outcome = hook(request=request, spider=spider)
            assert outcome is None or isinstance(outcome, (Response, Request)), \
                'Middleware %s.process_request must return None, Response or Request, got %s' % \
                (_owner(hook), outcome.__class__.__name__)
            if outcome:
                return outcome
        # No middleware produced a result: perform the actual download.
        return download_func(request=request, spider=spider)

    def process_response(response):
        assert response is not None, 'Received None in process_response'
        if isinstance(response, Request):
            return response

        current = response
        for hook in self.methods['process_response']:
            current = hook(request=request, response=current, spider=spider)
            assert isinstance(current, (Response, Request)), \
                'Middleware %s.process_response must return Response or Request, got %s' % \
                (_owner(hook), type(current))
            if isinstance(current, Request):
                break
        return current

    def process_exception(_failure):
        error = _failure.value
        for hook in self.methods['process_exception']:
            outcome = hook(request=request, exception=error, spider=spider)
            assert outcome is None or isinstance(outcome, (Response, Request)), \
                'Middleware %s.process_exception must return None, Response or Request, got %s' % \
                (_owner(hook), type(outcome))
            if outcome:
                return outcome
        # Unhandled: keep the failure travelling down the errback chain.
        return _failure

    chain_dfd = mustbe_deferred(process_request, request)
    chain_dfd.addErrback(process_exception)
    chain_dfd.addCallback(process_response)
    return chain_dfd
def download(self, download_func, request, spider):
    """Run ``request`` through the request/response/exception middleware lists.

    Uses ``self.request_middleware``, ``self.response_middleware`` and
    ``self.exception_middleware``. After the response middlewares have run
    (or one of them returned a Request) the ``response_received`` signal is
    sent through ``send_catch_log``. Returns the resulting deferred.
    """

    def _owner(hook):
        # Class name of the middleware owning a bound method (used only in
        # assertion messages).
        return hook.im_self.__class__.__name__

    def process_request(request):
        for hook in self.request_middleware:
            outcome = hook(request=request, spider=spider)
            assert outcome is None or isinstance(outcome, (Response, Request)), \
                'Middleware %s.process_request must return None, Response or Request, got %s' % \
                (_owner(hook), outcome.__class__.__name__)
            if outcome:
                return outcome
        return download_func(request=request, spider=spider)

    def process_response(response):
        assert response is not None, 'Received None in process_response'
        if isinstance(response, Request):
            return response

        for hook in self.response_middleware:
            response = hook(request=request, response=response, spider=spider)
            assert isinstance(response, (Response, Request)), \
                'Middleware %s.process_response must return Response or Request, got %s' % \
                (_owner(hook), type(response))
            if isinstance(response, Request):
                # Stop the chain; the signal below is still sent.
                break

        send_catch_log(signal=signals.response_received,
                       sender=self.__class__,
                       response=response,
                       spider=spider)
        return response

    def process_exception(_failure):
        error = _failure.value
        for hook in self.exception_middleware:
            outcome = hook(request=request, exception=error, spider=spider)
            assert outcome is None or isinstance(outcome, (Response, Request)), \
                'Middleware %s.process_exception must return None, Response or Request, got %s' % \
                (_owner(hook), type(outcome))
            if outcome:
                return outcome
        return _failure

    chain_dfd = mustbe_deferred(process_request, request)
    chain_dfd.addErrback(process_exception)
    chain_dfd.addCallback(process_response)
    return chain_dfd
def process_pokemon_request(self, request, info):
    """Schedule the media download for ``request`` and return its waiter.

    The request's callback/errback are detached and attached to a fresh
    ``Deferred`` (the waiter) that is registered in ``info.waiting`` under
    the request fingerprint. The deferred returned by this method always
    ends by yielding that waiter.
    """
    fp = request_fingerprint(request)

    # Detach the user callbacks from the request; they are fired through
    # the waiter instead. A missing callback becomes an identity function.
    on_result = request.callback
    if not on_result:
        on_result = lambda _: _
    on_error = request.errback
    request.callback = None
    request.errback = None

    # Register a waiter for the result of this fingerprint.
    waiter = Deferred().addCallbacks(on_result, on_error)
    info.waiting[fp].append(waiter)

    # Mark the fingerprint as in progress and start the download chain.
    info.downloading.add(fp)
    dfd = mustbe_deferred(self.media_to_download, request, info)
    dfd.addCallback(self._check_media_to_download, request, info)
    dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)

    def _log_failure(failure):
        return logger.error(
            failure.value,
            exc_info=failure_to_exc_info(failure),
            extra={'spider': info.spider},
        )

    dfd.addErrback(_log_failure)
    # Whatever happened above, the final result must be the waiter.
    return dfd.addBoth(lambda _: waiter)
def scrape_response(self, scrape_func, response, request, spider):
    """Run ``response`` through the spider middleware methods.

    ``process_spider_input`` methods run first and then ``scrape_func`` is
    called; ``process_spider_exception`` is attached as errback and
    ``process_spider_output`` as callback. Returns the resulting deferred.
    """

    def fname(f):
        # "<ClassName>.<method name>" of a bound method, for messages.
        return '%s.%s' % (f.im_self.__class__.__name__, f.im_func.__name__)

    def process_spider_input(response):
        for hook in self.methods['process_spider_input']:
            try:
                outcome = hook(response=response, spider=spider)
                assert outcome is None, \
                    'Middleware %s must returns None or ' \
                    'raise an exception, got %s ' \
                    % (fname(hook), type(outcome))
            except:
                # Any error (including the assertion above) is handed to
                # scrape_func as a Failure instead of the response.
                return scrape_func(Failure(), request, spider)
        return scrape_func(response, request, spider)

    def process_spider_exception(_failure):
        error = _failure.value
        for hook in self.methods['process_spider_exception']:
            outcome = hook(response=response, exception=error, spider=spider)
            assert outcome is None or _isiterable(outcome), \
                'Middleware %s must returns None, or an iterable object, got %s ' % \
                (fname(hook), type(outcome))
            if outcome is not None:
                return outcome
        # No middleware handled the exception.
        return _failure

    def process_spider_output(result):
        output = result
        for hook in self.methods['process_spider_output']:
            output = hook(response=response, result=output, spider=spider)
            assert _isiterable(output), \
                'Middleware %s must returns an iterable object, got %s ' % \
                (fname(hook), type(output))
        return output

    scrape_dfd = mustbe_deferred(process_spider_input, response)
    scrape_dfd.addErrback(process_spider_exception)
    scrape_dfd.addCallback(process_spider_output)
    return scrape_dfd
def download(self, download_func, request, spider):
    """Run ``request`` through the downloader middleware methods.

    Differs from a plain request/exception/response chain in that an error
    raised by a ``process_response`` method is routed directly to the
    ``process_exception`` methods. Returns the resulting deferred.
    """

    def _owner(hook):
        # Class name of the middleware owning a bound method (used only in
        # assertion messages).
        return hook.im_self.__class__.__name__

    def process_exception(_failure):
        error = _failure.value
        for hook in self.methods['process_exception']:
            outcome = hook(request=request, exception=error, spider=spider)
            assert outcome is None or isinstance(outcome, (Response, Request)), \
                'Middleware %s.process_exception must return None, Response or Request, got %s' % \
                (_owner(hook), type(outcome))
            if outcome:
                return outcome
        return _failure

    def process_request(request):
        for hook in self.methods['process_request']:
            outcome = hook(request=request, spider=spider)
            assert outcome is None or isinstance(outcome, (Response, Request)), \
                'Middleware %s.process_request must return None, Response or Request, got %s' % \
                (_owner(hook), outcome.__class__.__name__)
            if outcome:
                return outcome
        return download_func(request=request, spider=spider)

    def process_response(response):
        assert response is not None, 'Received None in process_response'
        if isinstance(response, Request):
            return response

        current = response
        for hook in self.methods['process_response']:
            try:
                current = hook(request=request, response=current, spider=spider)
            except:
                # Give the exception middlewares a chance to recover.
                return process_exception(Failure())
            assert isinstance(current, (Response, Request)), \
                'Middleware %s.process_response must return Response or Request, got %s' % \
                (_owner(hook), type(current))
            if isinstance(current, Request):
                break
        return current

    chain_dfd = mustbe_deferred(process_request, request)
    chain_dfd.addErrback(process_exception)
    chain_dfd.addCallback(process_response)
    return chain_dfd
def download(self, request, spider):
    """Fetch ``request`` through ``self.downloader`` and return a deferred.

    A Response result is tied to its request and logged; a Request result is
    passed to ``self.schedule``. Errors are logged and replaced by an
    ``IgnoreRequest`` failure. ``self.next_request(spider)`` is called once
    the chain completes, whatever the outcome.
    """

    def _handle_result(result):
        """Process what the downloader produced for the request."""
        assert isinstance(result, (Response, Request))
        if isinstance(result, Response):
            result.request = request  # tie request to response received
            logline = self._crawled_logline(request, result)
            log.msg(logline, level=log.DEBUG, spider=spider)
            return result
        if isinstance(result, Request):
            redirected = result
            dfd = mustbe_deferred(self.schedule, redirected, spider)
            if redirected.callback:
                # XXX: this is a bit hacky and should be removed
                dfd.addCallbacks(redirected.callback, redirected.errback)
            return dfd

    def _handle_error(_failure):
        """Log a download error and turn it into an IgnoreRequest failure."""
        exc = _failure.value
        if isinstance(exc, IgnoreRequest):
            errmsg, level = _failure.getErrorMessage(), exc.level
        else:
            errmsg, level = str(_failure), log.ERROR
        if errmsg:
            text = "Crawling <%s>: %s" % (request.url, errmsg)
            log.msg(text, level=level, spider=spider)
        return Failure(IgnoreRequest(str(exc)))

    def _finalize(outcome):
        self.next_request(spider)
        return outcome

    if spider not in self.downloader.sites:
        # The spider has no site in the downloader: fail right away.
        rejected = defer.fail(Failure(IgnoreRequest()))
        return rejected.addBoth(_finalize)

    fetch_dfd = mustbe_deferred(self.downloader.fetch, request, spider)
    fetch_dfd.addCallbacks(_handle_result, _handle_error)
    fetch_dfd.addBoth(_finalize)
    return fetch_dfd
def scrape_response(self, scrape_func, response, request, spider):
    """Apply the spider middleware methods around ``scrape_func``.

    Input methods run before ``scrape_func`` is called; exception methods
    are attached as errback and output methods as callback of the deferred
    that is returned.
    """

    def fname(bound):
        # Render a bound method as "<ClassName>.<function name>".
        owner = bound.im_self.__class__.__name__
        return '%s.%s' % (owner, bound.im_func.__name__)

    def process_spider_input(response):
        for mw_method in self.methods['process_spider_input']:
            try:
                returned = mw_method(response=response, spider=spider)
                assert returned is None, \
                    'Middleware %s must returns None or ' \
                    'raise an exception, got %s ' \
                    % (fname(mw_method), type(returned))
            except:
                # Whatever went wrong is passed on as a Failure.
                return scrape_func(Failure(), request, spider)
        return scrape_func(response, request, spider)

    def process_spider_exception(_failure):
        raised = _failure.value
        for mw_method in self.methods['process_spider_exception']:
            returned = mw_method(
                response=response, exception=raised, spider=spider)
            assert returned is None or _isiterable(returned), \
                'Middleware %s must returns None, or an iterable object, got %s ' % \
                (fname(mw_method), type(returned))
            if returned is None:
                continue
            # First non-None result wins.
            return returned
        return _failure

    def process_spider_output(result):
        produced = result
        for mw_method in self.methods['process_spider_output']:
            produced = mw_method(
                response=response, result=produced, spider=spider)
            assert _isiterable(produced), \
                'Middleware %s must returns an iterable object, got %s ' % \
                (fname(mw_method), type(produced))
        return produced

    dfd = mustbe_deferred(process_spider_input, response)
    dfd.addErrback(process_spider_exception)
    dfd.addCallback(process_spider_output)
    return dfd
def _download(self, request, spider):
    """Fetch ``request`` through the downloader on behalf of ``spider``.

    The request is added to the spider's slot first. A Response result is
    tied to its request, logged, and announced with the
    ``response_received`` signal. ``self.next_request(spider)`` is called
    when the chain completes, whatever the outcome. Returns the deferred.
    """
    spider_slot = self.slots[spider]
    spider_slot.add_request(request)

    def _handle_result(result):
        """Process what the downloader produced for the request."""
        assert isinstance(result, (Response, Request))
        if isinstance(result, Response):
            result.request = request  # tie request to response received
            logline = log.formatter.crawled(request, result, spider)
            log.msg(logline, level=log.DEBUG, spider=spider)
            send_catch_log(signal=signals.response_received,
                           response=result,
                           request=request,
                           spider=spider)
        return result

    def _finalize(outcome):
        self.next_request(spider)
        return outcome

    fetch_dfd = mustbe_deferred(self.downloader.fetch, request, spider)
    fetch_dfd.addCallback(_handle_result)
    fetch_dfd.addBoth(_finalize)
    return fetch_dfd
def _download(self, site, request, spider):
    """Download ``request`` with ``download_any`` and track the transfer.

    Returns the deferred obtained from ``mustbe_deferred``. When it fires,
    the request is removed from ``site.transferring`` and the queue is
    processed; if the site is closing, the outcome is replaced by an
    ``IgnoreRequest`` error.
    """
    # NOTE: the order of the statements below matters. Do not reorder them.

    # Step 1: build the download deferred.
    transfer = mustbe_deferred(download_any, request, spider)

    # Step 2: register the request as transferring. It is unregistered when
    # the result arrives so the slot can be reused by the following requests
    # (perhaps those that came from the downloader middleware itself).
    site.transferring.add(request)

    def _on_transfer_done(outcome):
        site.transferring.remove(request)
        self._process_queue(spider)
        if not site.closing:
            return outcome
        # Keep partially downloaded responses from propagating to the
        # downloader middleware, to speed up the closing process.
        log.msg("Crawled while closing spider: %s" % request,
                level=log.DEBUG, spider=spider)
        raise IgnoreRequest

    return transfer.addBoth(_on_transfer_done)
def _download(self, slot, request, spider):
    """Download ``request`` and emit the downloader signals around it.

    Sends ``response_downloaded`` when a response arrives and
    ``request_left_downloader`` once the request has been removed from
    ``slot.transferring`` and the queue has been processed. Returns the
    deferred carrying the response (or failure).
    """
    # NOTE: the order of the deferred callbacks below matters. Do not
    # reorder them.

    # Step 1: build the download deferred.
    transfer = mustbe_deferred(
        self.handlers.download_request, request, spider)

    # Step 2: tell the response_downloaded listeners about the download
    # before the queue is asked for the next request.
    def _announce_response(response):
        # response_downloaded: the response has just been obtained.
        self.signals.send_catch_log(
            signal=signals.response_downloaded,
            response=response,
            request=request,
            spider=spider)
        return response

    transfer.addCallback(_announce_response)

    # Step 3: once the response has arrived, drop the request from the
    # transferring state so the slot can be used by the following requests
    # (perhaps those that came from the downloader middleware itself).
    slot.transferring.add(request)

    def _on_transfer_done(outcome):
        slot.transferring.remove(request)
        self._process_queue(spider, slot)
        # request_left_downloader: the request has just left the downloader.
        self.signals.send_catch_log(
            signal=signals.request_left_downloader,
            request=request,
            spider=spider)
        return outcome

    return transfer.addBoth(_on_transfer_done)
def _download(self, site, request, spider):
    """Download ``request`` through ``self.handlers`` and track the transfer.

    Returns the deferred obtained from ``mustbe_deferred``. When it fires,
    the request is removed from ``site.transferring`` and the queue is
    processed; if the site is closing, the outcome is replaced by an
    ``IgnoreRequest`` error.
    """
    # NOTE: the order of the statements below matters. Do not reorder them.

    # Step 1: build the download deferred.
    download_dfd = mustbe_deferred(
        self.handlers.download_request, request, spider)

    # Step 2: flag the request as transferring; the flag is cleared when the
    # result arrives so the slot can be reused by the following requests
    # (perhaps those that came from the downloader middleware itself).
    site.transferring.add(request)

    def _release_slot(outcome):
        site.transferring.remove(request)
        self._process_queue(spider)
        if site.closing:
            # Keep partially downloaded responses from propagating to the
            # downloader middleware, to speed up the closing process.
            message = "Crawled while closing spider: %s" % request
            log.msg(message, level=log.DEBUG, spider=spider)
            raise IgnoreRequest
        return outcome

    return download_dfd.addBoth(_release_slot)
def scrape_response(self, scrape_func, response, request, spider):
    """Run ``response`` through the spider middleware methods.

    Input methods run first and then ``scrape_func`` is called; exception
    methods are attached as errback and output methods as callback of the
    deferred that is returned.
    """
    # NOTE(review): the original author was unsure what scrape_func is; it
    # is only called here, with (response-or-Failure, request, spider).

    def fname(f):
        # Takes a bound method and returns the string made of
        # f.im_self.__class__.__name__ and f.im_func.__name__ joined by ".".
        return '%s.%s' % (f.im_self.__class__.__name__, f.im_func.__name__)

    def process_spider_input(response):
        # Call, in order, the process_spider_input method of every
        # middleware stored in self.methods.
        for mw_method in self.methods['process_spider_input']:
            try:
                # The value returned by each middleware is not kept; it is
                # only checked to be None.
                returned = mw_method(response=response, spider=spider)
                assert returned is None, \
                    'Middleware %s must returns None or ' \
                    'raise an exception, got %s ' \
                    % (fname(mw_method), type(returned))
                # A None result means the call succeeded.
                # NOTE(review): presumably a middleware may modify the
                # response in place, so there is nothing to record - confirm.
            except:
                # A middleware failed: hand a Failure() to scrape_func
                # instead of the response.
                # NOTE(review): presumably this ends up in the errback
                # chain - confirm against scrape_func.
                return scrape_func(Failure(), request, spider)
        # Everything went fine: hand the response to scrape_func.
        return scrape_func(response, request, spider)

    def process_spider_exception(_failure):
        raised = _failure.value
        for mw_method in self.methods['process_spider_exception']:
            returned = mw_method(response=response, exception=raised,
                                 spider=spider)
            assert returned is None or _isiterable(returned), \
                'Middleware %s must returns None, or an iterable object, got %s ' % \
                (fname(mw_method), type(returned))
            # Anything other than None breaks out of the chain and is
            # returned directly.
            # NOTE(review): the original author wondered whether this
            # result then reaches the callback - confirm.
            if returned is not None:
                return returned
        return _failure

    def process_spider_output(result):
        produced = result
        for mw_method in self.methods['process_spider_output']:
            produced = mw_method(response=response, result=produced,
                                 spider=spider)
            assert _isiterable(produced), \
                'Middleware %s must returns an iterable object, got %s ' % \
                (fname(mw_method), type(produced))
        return produced

    # mustbe_deferred is given process_spider_input and the response.
    # NOTE(review): presumably it calls the function and wraps the result or
    # error in a deferred - confirm against its definition.
    dfd = mustbe_deferred(process_spider_input, response)
    dfd.addErrback(process_spider_exception)   # attach the errback
    dfd.addCallback(process_spider_output)     # attach the callback
    return dfd
def scrape_response(self, scrape_func, response, request, spider):
    """Run ``response`` through the spider middleware methods.

    ``process_spider_input`` methods run before ``scrape_func``. The
    ``process_spider_output`` and ``process_spider_exception`` chains can
    hand control over to each other, resuming at the method that follows
    the one that produced the result or raised. Returns the deferred.
    """

    def fname(f):
        # "<ClassName>.<function name>" of a bound method, for messages.
        return '%s.%s' % (
            six.get_method_self(f).__class__.__name__,
            six.get_method_function(f).__name__)

    def process_spider_input(response):
        for mw_method in self.methods['process_spider_input']:
            try:
                returned = mw_method(response=response, spider=spider)
                if returned is not None:
                    raise _InvalidOutput('Middleware {} must return None or raise an exception, got {}' \
                                         .format(fname(mw_method), type(returned)))
            except _InvalidOutput:
                # Invalid output is a middleware bug: let it propagate.
                raise
            except Exception:
                return scrape_func(Failure(), request, spider)
        return scrape_func(response, request, spider)

    def process_spider_exception(_failure, start_index=0):
        raised = _failure.value
        # don't handle _InvalidOutput exception
        if isinstance(raised, _InvalidOutput):
            return _failure
        remaining = islice(
            self.methods['process_spider_exception'], start_index, None)
        for position, mw_method in enumerate(remaining, start=start_index):
            if mw_method is None:
                continue
            returned = mw_method(response=response, exception=raised,
                                 spider=spider)
            if _isiterable(returned):
                # stop exception handling by handing control over to the
                # process_spider_output chain if an iterable has been returned
                return process_spider_output(returned, position + 1)
            if returned is not None:
                raise _InvalidOutput('Middleware {} must return None or an iterable, got {}' \
                                     .format(fname(mw_method), type(returned)))
        return _failure

    def process_spider_output(result, start_index=0):
        # items in this iterable do not need to go through the process_spider_output
        # chain, they went through it already from the process_spider_exception method
        recovered = MutableChain()

        def evaluate_iterable(iterable, index):
            try:
                for item in iterable:
                    yield item
            except Exception as ex:
                handled = process_spider_exception(Failure(ex), index + 1)
                if isinstance(handled, Failure):
                    raise
                recovered.extend(handled)

        remaining = islice(
            self.methods['process_spider_output'], start_index, None)
        for position, mw_method in enumerate(remaining, start=start_index):
            if mw_method is None:
                continue
            # the following might fail directly if the output value is not a generator
            try:
                result = mw_method(response=response, result=result,
                                   spider=spider)
            except Exception as ex:
                handled = process_spider_exception(Failure(ex), position + 1)
                if isinstance(handled, Failure):
                    raise
                return handled
            if not _isiterable(result):
                raise _InvalidOutput('Middleware {} must return an iterable, got {}' \
                                     .format(fname(mw_method), type(result)))
            result = evaluate_iterable(result, position)

        return chain(result, recovered)

    dfd = mustbe_deferred(process_spider_input, response)
    dfd.addCallbacks(callback=process_spider_output,
                     errback=process_spider_exception)
    return dfd
def fetch(self, url):
    """Build a ``Request`` for ``url`` and download it.

    The request is passed to ``self.process_request`` (its return value is
    discarded) and then to ``self.downloader.download_request`` with
    ``None`` as the spider argument. Returns what ``mustbe_deferred``
    returns for that call.
    """
    message = "Fetch URL %s" % url
    log.debug(message)

    outgoing = Request(url=url)
    self.process_request(outgoing)

    handler = self.downloader.download_request
    return mustbe_deferred(handler, outgoing, None)