def iterate_spider_output(result):
    """Turn a spider callback result into an iterable or a Deferred.

    When ``collect_asyncgen`` is truthy, ``inspect`` provides ``isasyncgen``
    and ``result`` is an async generator, the generator is collected through
    ``deferred_from_coro`` and the returned Deferred gets this function as a
    callback, so the collected value is processed by this function again.
    Any other result is passed through ``deferred_from_coro`` and then
    ``arg_to_iter``.
    """
    asyncgen_supported = collect_asyncgen and hasattr(inspect, 'isasyncgen')
    if not (asyncgen_supported and inspect.isasyncgen(result)):
        return arg_to_iter(deferred_from_coro(result))

    collected = deferred_from_coro(collect_asyncgen(result))
    collected.addCallback(iterate_spider_output)
    return collected
def close(self) -> Deferred:
    """Close the parent handler, then the browser context, browser and Playwright.

    Generator that yields Deferreds one after another; presumably driven by
    Twisted's ``inlineCallbacks`` (the decorator is not visible here).
    The ``context`` and ``browser`` attributes are only closed when they
    exist and are truthy.
    """
    yield super().close()

    if getattr(self, "context", None):
        logger.info("Closing browser context")
        context_closed = self.context.close()
        yield deferred_from_coro(context_closed)

    if getattr(self, "browser", None):
        logger.info("Closing browser")
        browser_closed = self.browser.close()
        yield deferred_from_coro(browser_closed)

    manager_exit = self.playwright_context_manager.__aexit__()
    yield deferred_from_coro(manager_exit)
def iterate_spider_output(result):
    """Turn a spider callback result into an iterable or a Deferred.

    * Async generators are returned untouched.
    * Coroutines are wrapped via ``deferred_from_coro``; the resulting
      Deferred gets this function as a callback so the awaited value is
      processed by this function again.
    * Anything else goes through ``deferred_from_coro`` and ``arg_to_iter``.
    """
    if inspect.isasyncgen(result):
        return result

    if not inspect.iscoroutine(result):
        return arg_to_iter(deferred_from_coro(result))

    pending = deferred_from_coro(result)
    pending.addCallback(iterate_spider_output)
    return pending
def process_request(request):
    """Run ``request`` through every ``process_request`` middleware method.

    Generator yielding Deferreds; presumably driven by Twisted's
    ``inlineCallbacks`` (decorator not visible here). ``self``, ``spider``
    and ``download_func`` come from the enclosing scope.

    A truthy middleware result is returned immediately. If no method
    returns one, the result of ``download_func`` is returned.

    Raises:
        _InvalidOutput: a method returned something other than None,
            a Response or a Request.
    """
    for mw_method in self.methods['process_request']:
        outcome = yield deferred_from_coro(mw_method(request=request, spider=spider))
        if not (outcome is None or isinstance(outcome, (Response, Request))):
            raise _InvalidOutput(
                "Middleware %s.process_request must return None, Response or Request, got %s"
                % (mw_method.__self__.__class__.__name__, outcome.__class__.__name__)
            )
        if outcome:
            return outcome

    downloaded = yield download_func(request=request, spider=spider)
    return downloaded
def _test_asyncgen_base(self, *mw_classes, downgrade: bool = False,
                        start_index: Optional[int] = None):
    """Check that the middleware chain yields an async iterator of items.

    Generator yielding Deferreds; presumably driven by ``inlineCallbacks``
    (decorator not visible here).

    Asserts that the middleware result is an ``AsyncIterator``, that it
    holds ``self.RESULT_COUNT`` items whose first element is of
    ``self.ITEM_TYPE``, and that the "downgraded to a non-async" message
    appears in the captured log exactly when ``downgrade`` is true.
    """
    # NOTE(review): only the middleware call is assumed to run inside the
    # log capture -- confirm against the original file layout.
    with LogCapture() as captured:
        result = yield self._get_middleware_result(*mw_classes, start_index=start_index)

    self.assertIsInstance(result, collections.abc.AsyncIterator)

    collected = yield deferred_from_coro(collect_asyncgen(result))
    self.assertEqual(len(collected), self.RESULT_COUNT)
    self.assertIsInstance(collected[0], self.ITEM_TYPE)

    was_downgraded = "downgraded to a non-async" in str(captured)
    self.assertEqual(was_downgraded, downgrade)
def process_exception(failure):
    """Offer the failure's exception to every ``process_exception`` method.

    Generator yielding Deferreds; presumably driven by ``inlineCallbacks``
    (decorator not visible here). ``self``, ``request`` and ``spider`` come
    from the enclosing scope.

    The first truthy middleware result is returned. If no method returns
    one, the original ``failure`` is returned.

    Raises:
        _InvalidOutput: a method returned something other than None,
            a Response or a Request.
    """
    error = failure.value
    for mw_method in self.methods['process_exception']:
        outcome = yield deferred_from_coro(
            mw_method(request=request, exception=error, spider=spider)
        )
        if not (outcome is None or isinstance(outcome, (Response, Request))):
            raise _InvalidOutput(
                "Middleware %s.process_exception must return None, Response or Request, got %s"
                % (mw_method.__self__.__class__.__name__, type(outcome))
            )
        if outcome:
            return outcome
    return failure
def process_exception(failure):
    """Offer the failure's exception to every ``process_exception`` method.

    Generator yielding Deferreds; presumably driven by ``inlineCallbacks``
    (decorator not visible here). ``self``, ``request`` and ``spider`` come
    from the enclosing scope.

    The first truthy middleware result is returned. If no method returns
    one, the original ``failure`` is returned.

    Raises:
        _InvalidOutput: a method returned something other than None,
            a Response or a Request.
    """
    allowed_types = (Response, Request)
    error = failure.value
    for method in self.methods['process_exception']:
        pending = method(request=request, exception=error, spider=spider)
        response = yield deferred_from_coro(pending)
        if not (response is None or isinstance(response, allowed_types)):
            raise _InvalidOutput(
                f"Middleware {method.__qualname__} must return None, Response or "
                f"Request, got {type(response)}"
            )
        if response:
            return response
    return failure
def process_request(request: Request):
    """Run ``request`` through every ``process_request`` middleware method.

    Generator yielding Deferreds; presumably driven by ``inlineCallbacks``
    (decorator not visible here). ``self``, ``spider`` and ``download_func``
    come from the enclosing scope.

    A truthy middleware result is returned immediately. If no method
    returns one, the result of ``download_func`` is returned.

    Raises:
        _InvalidOutput: a method returned something other than None,
            a Response or a Request.
    """
    for raw_method in self.methods['process_request']:
        # cast() only informs the type checker; the object is unchanged.
        method = cast(Callable, raw_method)
        pending = method(request=request, spider=spider)
        response = yield deferred_from_coro(pending)
        if not (response is None or isinstance(response, (Response, Request))):
            raise _InvalidOutput(
                f"Middleware {method.__qualname__} must return None, Response or "
                f"Request, got {response.__class__.__name__}")
        if response:
            return response

    downloaded = yield download_func(request=request, spider=spider)
    return downloaded
def process_response(response):
    """Run ``response`` through every ``process_response`` middleware method.

    Generator yielding Deferreds; presumably driven by Twisted's
    ``inlineCallbacks`` (decorator not visible here). ``self``, ``request``
    and ``spider`` come from the enclosing scope.

    A Request (either passed in or returned by a method) ends the chain
    and is returned as-is. Otherwise the response produced by the last
    method is returned.

    Raises:
        TypeError: ``response`` is None.
        _InvalidOutput: a method returned something other than a Response
            or a Request.
    """
    if response is None:
        # Explicit check rather than ``assert``: assertions are stripped
        # under ``python -O`` and None would reach the middleware methods.
        raise TypeError('Received None in process_response')
    if isinstance(response, Request):
        return response

    for method in self.methods['process_response']:
        response = yield deferred_from_coro(
            method(request=request, response=response, spider=spider))
        if not isinstance(response, (Response, Request)):
            raise _InvalidOutput(
                'Middleware %s.process_response must return Response or Request, got %s'
                % (method.__self__.__class__.__name__, type(response)))
        if isinstance(response, Request):
            return response
    return response
def process_request(request):
    """Run ``request`` through every ``process_request`` middleware method.

    Generator yielding Deferreds; presumably driven by ``inlineCallbacks``
    (decorator not visible here). ``self``, ``spider`` and ``download_func``
    come from the enclosing scope.

    A truthy middleware result is returned immediately. If no method
    returns one, the result of ``download_func`` is returned.

    Raises:
        _InvalidOutput: a method returned something other than None,
            a Response or a Request.
    """
    for method in self.methods['process_request']:
        # deferred_from_coro converts the middleware method's (possibly
        # asyncio-based) result into a reactor-side Deferred, which is
        # then yielded.
        pending = method(request=request, spider=spider)
        response = yield deferred_from_coro(pending)
        if not (response is None or isinstance(response, (Response, Request))):
            raise _InvalidOutput(
                f"Middleware {method.__self__.__class__.__name__}"
                ".process_request must return None, Response or "
                f"Request, got {response.__class__.__name__}")
        if response:
            return response

    downloaded = yield download_func(request=request, spider=spider)
    return downloaded
def process_response(response):
    """Run ``response`` through every ``process_response`` middleware method.

    Generator yielding Deferreds; presumably driven by ``inlineCallbacks``
    (decorator not visible here). ``self``, ``request`` and ``spider`` come
    from the enclosing scope.

    A Request (either passed in or returned by a method) ends the chain
    and is returned as-is. Otherwise the response produced by the last
    method is returned.

    Raises:
        TypeError: ``response`` is None.
        _InvalidOutput: a method returned something other than a Response
            or a Request.
    """
    if response is None:
        raise TypeError("Received None in process_response")
    if isinstance(response, Request):
        return response

    for mw_method in self.methods['process_response']:
        pending = mw_method(request=request, response=response, spider=spider)
        response = yield deferred_from_coro(pending)
        if not isinstance(response, (Response, Request)):
            raise _InvalidOutput(
                "Middleware %s.process_response must return Response or Request, got %s"
                % (mw_method.__self__.__class__.__name__, type(response))
            )
        if isinstance(response, Request):
            break
    return response
def process_response(response: Union[Response, Request]):
    """Run ``response`` through every ``process_response`` middleware method.

    Generator yielding Deferreds; presumably driven by ``inlineCallbacks``
    (decorator not visible here). ``self``, ``request`` and ``spider`` come
    from the enclosing scope.

    A Request (either passed in or returned by a method) ends the chain
    and is returned as-is. Otherwise the response produced by the last
    method is returned.

    Raises:
        TypeError: ``response`` is None.
        _InvalidOutput: a method returned something other than a Response
            or a Request.
    """
    if response is None:
        raise TypeError("Received None in process_response")
    if isinstance(response, Request):
        return response

    for method in self.methods['process_response']:
        pending = method(request=request, response=response, spider=spider)
        response = yield deferred_from_coro(pending)
        if not isinstance(response, (Response, Request)):
            raise _InvalidOutput(
                f"Middleware {method.__qualname__} must return Response or Request, "
                f"got {type(response)}")
        if isinstance(response, Request):
            return response
    return response
def close(self) -> Deferred:
    """Close the parent handler, then the browser if one is set.

    Generator yielding Deferreds; presumably driven by ``inlineCallbacks``
    (decorator not visible here).
    """
    yield super().close()
    if not self.browser:
        return
    browser_closed = self.browser.close()
    yield deferred_from_coro(browser_closed)
def download_request(self, request: Request, spider: Spider) -> Deferred:
    """Download ``request``, choosing the path by its ``pyppeteer`` meta flag.

    Requests whose ``meta["pyppeteer"]`` is truthy are handled by
    ``self._download_request`` (wrapped via ``deferred_from_coro``); all
    others are delegated to the parent class.
    """
    if not request.meta.get("pyppeteer"):
        return super().download_request(request, spider)
    pending = self._download_request(request, spider)
    return deferred_from_coro(pending)
def _engine_started_handler(self) -> Deferred:
    """Launch the browser and return the wrapping Deferred."""
    launch = self._launch_browser()
    return deferred_from_coro(launch)
def start(self):
    """Start serving and keep the resulting Deferred on ``self.serv_deferred``."""
    serving = self.server.serve()
    self.serv_deferred = deferred_from_coro(serving)
def _engine_started(self) -> Deferred:
    """Log the launch, start the browser and return the wrapping Deferred."""
    logger.info("Launching browser")
    launch = self._launch_browser()
    return deferred_from_coro(launch)
def close(self) -> Deferred:
    """Close the parent handler, then run this handler's own ``_close``.

    Generator yielding Deferreds; presumably driven by ``inlineCallbacks``
    (decorator not visible here).
    """
    yield super().close()
    own_cleanup = self._close()
    yield deferred_from_coro(own_cleanup)
def iterate_spider_output(result):
    """Pass ``result`` through ``deferred_from_coro`` and then ``arg_to_iter``."""
    maybe_deferred = deferred_from_coro(result)
    return arg_to_iter(maybe_deferred)
def open_spider(self, spider):
    """Call ``self._open_spider(spider)`` and return it wrapped via ``deferred_from_coro``."""
    opening = self._open_spider(spider)
    return deferred_from_coro(opening)
def _engine_started(self) -> Deferred:
    """Start the browser.

    Hooked to the engine_started signal because that signal supports
    handlers that return deferreds.
    """
    launch = self._launch_browser()
    return deferred_from_coro(launch)
def _process_spider_output(self, response: Response, spider: Spider,
                           result: Union[Iterable, AsyncIterable], start_index: int = 0
                           ) -> Deferred:
    """Pass ``result`` through the ``process_spider_output`` methods.

    Generator yielding Deferreds; presumably driven by ``inlineCallbacks``
    (decorator not visible here).

    Starting at ``start_index``, each registered method receives the
    current result. Before the call the result is converted between sync
    and async iterables when the method's kind does not match it. If the
    conversion or the method call raises, handling is handed over to
    ``self._process_spider_exception``; its result is returned unless it
    is a Failure, in which case the exception is re-raised.

    Returns a ``MutableAsyncChain`` or ``MutableChain`` (depending on
    whether the final result is async) combining the final result with
    the recovered items.

    Raises:
        _InvalidOutput: a method returned a non-iterable value.
    """
    # items in this iterable do not need to go through the process_spider_output
    # chain, they went through it already from the process_spider_exception method
    last_result_is_async = isinstance(result, AsyncIterable)
    recovered: Union[MutableChain, MutableAsyncChain] = (
        MutableAsyncChain() if last_result_is_async else MutableChain()
    )

    # There are three cases for the middleware: def foo, async def foo, def foo + async def foo_async.
    # 1. def foo. Sync iterables are passed as is, async ones are downgraded.
    # 2. async def foo. Sync iterables are upgraded, async ones are passed as is.
    # 3. def foo + async def foo_async. Iterables are passed to the respective method.
    # Storing methods and method tuples in the same list is weird but we should be able to roll this back
    # when we drop this compatibility feature.
    remaining_methods = islice(self.methods['process_spider_output'], start_index, None)
    for method_index, method_pair in enumerate(remaining_methods, start=start_index):
        if method_pair is None:
            continue

        if isinstance(method_pair, tuple):
            # This tuple handling is only needed until _async compatibility methods are removed.
            method_sync, method_async = method_pair
            method = method_async if last_result_is_async else method_sync
            need_upgrade = need_downgrade = False
        else:
            method = method_pair
            method_is_asyncgen = isasyncgenfunction(method)
            need_upgrade = not last_result_is_async and method_is_asyncgen
            need_downgrade = last_result_is_async and not method_is_asyncgen

        try:
            if need_upgrade:
                # Iterable -> AsyncIterable
                result = as_async_generator(result)
            elif need_downgrade:
                if not self.downgrade_warning_done:
                    logger.warning(
                        f"Async iterable passed to {method.__qualname__} "
                        f"was downgraded to a non-async one")
                    self.downgrade_warning_done = True
                assert isinstance(result, AsyncIterable)
                # AsyncIterable -> Iterable
                result = yield deferred_from_coro(collect_asyncgen(result))
                if isinstance(recovered, AsyncIterable):
                    recovered_collected = yield deferred_from_coro(
                        collect_asyncgen(recovered))
                    recovered = MutableChain(recovered_collected)
            # might fail directly if the output value is not a generator
            result = method(response=response, result=result, spider=spider)
        except Exception as ex:
            exception_result = self._process_spider_exception(
                response, spider, Failure(ex), method_index + 1)
            if isinstance(exception_result, Failure):
                raise
            return exception_result

        if not _isiterable(result):
            if iscoroutine(result):
                result.close()  # Silence warning about not awaiting
                msg = (f"{method.__qualname__} must be an asynchronous "
                       f"generator (i.e. use yield)")
            else:
                msg = (
                    f"{method.__qualname__} must return an iterable, got "
                    f"{type(result)}")
            raise _InvalidOutput(msg)

        result = self._evaluate_iterable(
            response, spider, result, method_index + 1, recovered)
        last_result_is_async = isinstance(result, AsyncIterable)

    if last_result_is_async:
        return MutableAsyncChain(result, recovered)
    return MutableChain(result, recovered)  # type: ignore[arg-type]