def _process_request(self, request, info): fp = request_fingerprint(request) cb = request.callback or (lambda _: _) eb = request.errback request.callback = None request.errback = None # Return cached result if request was already seen if fp in info.downloaded: return defer_result(info.downloaded[fp]).addCallbacks(cb, eb) # Otherwise, wait for result wad = Deferred().addCallbacks(cb, eb) info.waiting[fp].append(wad) # Check if request is downloading right now to avoid doing it twice if fp in info.downloading: return wad # Download request checking media_to_download hook output first info.downloading.add(fp) dfd = mustbe_deferred(self.media_to_download, request, info) dfd.addCallback(self._check_media_to_download, request, info) dfd.addBoth(self._cache_result_and_execute_waiters, fp, info) dfd.addErrback(log.spider_log, spider=info.spider) return dfd.addBoth(lambda _: wad) # it must return wad at last
def _download(self, slot, request, spider): # The order is very important for the following deferreds. Do not change! # 1. Create the download deferred dfd = mustbe_deferred(self.handlers.download_request, request, spider) # 2. Notify response_downloaded listeners about the recent download # before querying queue for next request def _downloaded(response): self.signals.send_catch_log(signal=signals.response_downloaded, response=response, request=request, spider=spider) return response dfd.addCallback(_downloaded) # 3. After response arrives, remove the request from transferring # state to free up the transferring slot so it can be used by the # following requests (perhaps those which came from the downloader # middleware itself) slot.transferring.add(request) def finish_transferring(_): slot.transferring.remove(request) self._process_queue(spider, slot) return _ return dfd.addBoth(finish_transferring)
def _check_media_to_download(self, result, request, info): if result is not None: return result if self.download_func: # this ugly code was left only to support tests. TODO: remove dfd = mustbe_deferred(self.download_func, request, info.spider) dfd.addCallbacks(callback=self.media_downloaded, callbackArgs=(request, info), errback=self.media_failed, errbackArgs=(request, info)) else: request.meta['handle_httpstatus_all'] = True dfd = self.crawler.engine.download(request, info.spider) dfd.addCallbacks(callback=self.media_downloaded, callbackArgs=(request, info), errback=self.media_failed, errbackArgs=(request, info)) return dfd
def download(self, download_func, request, spider): def process_request(request): for method in self.methods['process_request']: response = method(request=request, spider=spider) assert response is None or isinstance(response, (Response, Request)), \ 'Middleware %s.process_request must return None, Response or Request, got %s' % \ (method.im_self.__class__.__name__, response.__class__.__name__) if response: return response return download_func(request=request, spider=spider) def process_response(response): assert response is not None, 'Received None in process_response' if isinstance(response, Request): return response for method in self.methods['process_response']: response = method(request=request, response=response, spider=spider) assert isinstance(response, (Response, Request)), \ 'Middleware %s.process_response must return Response or Request, got %s' % \ (method.im_self.__class__.__name__, type(response)) if isinstance(response, Request): return response return response def process_exception(_failure): exception = _failure.value for method in self.methods['process_exception']: response = method(request=request, exception=exception, spider=spider) assert response is None or isinstance(response, (Response, Request)), \ 'Middleware %s.process_exception must return None, Response or Request, got %s' % \ (method.im_self.__class__.__name__, type(response)) if response: return response return _failure deferred = mustbe_deferred(process_request, request) deferred.addErrback(process_exception) deferred.addCallback(process_response) return deferred
def scrape_response(self, scrape_func, response, request, spider): fname = lambda f: '%s.%s' % (f.im_self.__class__.__name__, f.im_func. __name__) def process_spider_input(response): for method in self.methods['process_spider_input']: try: result = method(response=response, spider=spider) assert result is None, \ 'Middleware %s must returns None or ' \ 'raise an exception, got %s ' \ % (fname(method), type(result)) except: return scrape_func(Failure(), request, spider) return scrape_func(response, request, spider) def process_spider_exception(_failure): exception = _failure.value for method in self.methods['process_spider_exception']: result = method(response=response, exception=exception, spider=spider) assert result is None or _isiterable(result), \ 'Middleware %s must returns None, or an iterable object, got %s ' % \ (fname(method), type(result)) if result is not None: return result return _failure def process_spider_output(result): for method in self.methods['process_spider_output']: result = method(response=response, result=result, spider=spider) assert _isiterable(result), \ 'Middleware %s must returns an iterable object, got %s ' % \ (fname(method), type(result)) return result dfd = mustbe_deferred(process_spider_input, response) dfd.addErrback(process_spider_exception) dfd.addCallback(process_spider_output) return dfd