Esempio n. 1
0
def iterate_spider_output(result):
    """Normalise a spider callback result for the caller.

    When ``result`` is an async generator it is collected through
    ``collect_asyncgen``; the object returned by ``deferred_from_coro`` then
    gets this same function attached as a callback and is returned. Any other
    value is passed through ``deferred_from_coro`` and ``arg_to_iter``.
    """
    # Short-circuit order matters: helper availability first, then whether
    # ``inspect`` offers ``isasyncgen`` at all, then the actual type check.
    is_async_generator = (
        collect_asyncgen
        and hasattr(inspect, 'isasyncgen')
        and inspect.isasyncgen(result)
    )
    if not is_async_generator:
        return arg_to_iter(deferred_from_coro(result))

    collected = deferred_from_coro(collect_asyncgen(result))
    collected.addCallback(iterate_spider_output)
    return collected
Esempio n. 2
0
 def close(self) -> Deferred:
     """Close the parent first, then the browser context, browser and Playwright.

     Written as a generator that yields each step in order; the context and
     the browser are only closed when the attribute exists and is truthy.
     """
     yield super().close()
     # Same order as before: context first, then the browser itself.
     closables = (
         ("context", "Closing browser context"),
         ("browser", "Closing browser"),
     )
     for attr_name, log_message in closables:
         resource = getattr(self, attr_name, None)
         if resource:
             logger.info(log_message)
             yield deferred_from_coro(resource.close())
     manager_exit = self.playwright_context_manager.__aexit__()
     yield deferred_from_coro(manager_exit)
Esempio n. 3
0
def iterate_spider_output(result):
    """Normalise a spider callback result for the caller.

    * async generators are returned untouched;
    * coroutines are wrapped with ``deferred_from_coro`` and this function is
      attached as a callback so the eventual value is normalised too;
    * everything else goes through ``deferred_from_coro`` and ``arg_to_iter``.
    """
    if inspect.isasyncgen(result):
        return result
    if not inspect.iscoroutine(result):
        return arg_to_iter(deferred_from_coro(result))

    pending = deferred_from_coro(result)
    pending.addCallback(iterate_spider_output)
    return pending
Esempio n. 4
0
 def process_request(request):
     """Offer ``request`` to each ``process_request`` hook, then download it.

     The first hook that yields a truthy ``Response`` or ``Request``
     short-circuits the chain. A hook result that is neither ``None`` nor one
     of those two types raises ``_InvalidOutput``. If no hook short-circuits,
     the result of ``download_func`` is returned.
     """
     for hook in self.methods['process_request']:
         outcome = yield deferred_from_coro(hook(request=request, spider=spider))
         if outcome is None:
             # Nothing to say: move on to the next hook.
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 "Middleware %s.process_request must return None, Response or Request, got %s"
                 % (hook.__self__.__class__.__name__, outcome.__class__.__name__)
             )
         if outcome:
             return outcome
     return (yield download_func(request=request, spider=spider))
Esempio n. 5
0
 def _test_asyncgen_base(self,
                         *mw_classes,
                         downgrade: bool = False,
                         start_index: Optional[int] = None):
     """Check that the middleware chain produces an async iterator.

     Runs ``mw_classes`` via ``self._get_middleware_result`` while capturing
     logs, collects the async result, and verifies its length, the type of
     its first element, and whether the "downgraded" message was logged
     (expected exactly when ``downgrade`` is true).
     """
     with LogCapture() as captured:
         chain_output = yield self._get_middleware_result(
             *mw_classes, start_index=start_index)
     self.assertIsInstance(chain_output, collections.abc.AsyncIterator)

     collected = yield deferred_from_coro(collect_asyncgen(chain_output))
     self.assertEqual(len(collected), self.RESULT_COUNT)
     self.assertIsInstance(collected[0], self.ITEM_TYPE)

     warning_logged = "downgraded to a non-async" in str(captured)
     self.assertEqual(warning_logged, downgrade)
Esempio n. 6
0
 def process_exception(failure):
     """Offer the exception held by ``failure`` to each ``process_exception`` hook.

     The first hook that yields a truthy ``Response`` or ``Request`` wins and
     its value is returned. A result that is neither ``None`` nor one of
     those types raises ``_InvalidOutput``. When no hook handles the
     exception, the original ``failure`` is returned.
     """
     raised = failure.value
     for hook in self.methods['process_exception']:
         outcome = yield deferred_from_coro(hook(request=request, exception=raised, spider=spider))
         if outcome is None:
             # This hook declined to handle the exception.
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 "Middleware %s.process_exception must return None, Response or Request, got %s"
                 % (hook.__self__.__class__.__name__, type(outcome))
             )
         if outcome:
             return outcome
     return failure
Esempio n. 7
0
 def process_exception(failure):
     """Offer the exception held by ``failure`` to each ``process_exception`` hook.

     Returns the first truthy ``Response``/``Request`` produced by a hook,
     raises ``_InvalidOutput`` for any other non-``None`` result, and falls
     back to returning ``failure`` itself when no hook handles it.
     """
     raised = failure.value
     for method in self.methods['process_exception']:
         pending = deferred_from_coro(method(request=request, exception=raised, spider=spider))
         response = yield pending
         if response is None:
             # This hook declined to handle the exception.
             continue
         if not isinstance(response, (Response, Request)):
             raise _InvalidOutput(
                 f"Middleware {method.__qualname__} must return None, Response or "
                 f"Request, got {type(response)}"
             )
         if response:
             return response
     return failure
Esempio n. 8
0
 def process_request(request: Request):
     """Offer ``request`` to each ``process_request`` hook, then download it.

     Returns the first truthy ``Response``/``Request`` produced by a hook,
     raises ``_InvalidOutput`` for any other non-``None`` result, and
     otherwise returns whatever ``download_func`` yields.
     """
     for registered in self.methods['process_request']:
         # ``cast`` only informs the type checker; the object is unchanged.
         method = cast(Callable, registered)
         pending = deferred_from_coro(method(request=request, spider=spider))
         response = yield pending
         if response is None:
             continue
         if not isinstance(response, (Response, Request)):
             raise _InvalidOutput(
                 f"Middleware {method.__qualname__} must return None, Response or "
                 f"Request, got {response.__class__.__name__}")
         if response:
             return response
     return (yield download_func(request=request, spider=spider))
Esempio n. 9
0
        def process_response(response):
            """Run ``response`` through every ``process_response`` middleware method.

            A ``Request`` (either the incoming value or one produced by a
            middleware method) stops the chain and is returned immediately.
            Otherwise the final ``Response`` is returned.

            Raises:
                AssertionError: if ``response`` is ``None`` on entry.
                _InvalidOutput: if a middleware method returns something that
                    is neither a ``Response`` nor a ``Request``.
            """
            # Raised explicitly rather than via ``assert`` so the check is not
            # stripped when Python runs with ``-O``; the exception type and
            # message are the same as before.
            if response is None:
                raise AssertionError('Received None in process_response')
            # Plain ``return`` replaces the legacy ``defer.returnValue`` calls;
            # this generator is presumably driven by ``inlineCallbacks``
            # (decorator not visible here), which accepts it on Python 3.
            if isinstance(response, Request):
                return response

            for method in self.methods['process_response']:
                response = yield deferred_from_coro(
                    method(request=request, response=response, spider=spider))
                if not isinstance(response, (Response, Request)):
                    raise _InvalidOutput(
                        'Middleware %s.process_response must return Response or Request, got %s'
                        % (method.__self__.__class__.__name__, type(response)))
                if isinstance(response, Request):
                    return response
            return response
Esempio n. 10
0
 def process_request(request):
     """Offer ``request`` to each ``process_request`` hook, then download it.

     Returns the first truthy ``Response``/``Request`` produced by a hook,
     raises ``_InvalidOutput`` for any other non-``None`` result, and
     otherwise returns whatever ``download_func`` yields.
     """
     for method in self.methods['process_request']:
         # Per the original author's note: deferred_from_coro adapts the
         # middleware method's (possibly asyncio-based) result into something
         # the Twisted reactor can drive, and that is what gets yielded here.
         pending = deferred_from_coro(method(request=request, spider=spider))
         response = yield pending
         if response is None:
             continue
         if not isinstance(response, (Response, Request)):
             raise _InvalidOutput(
                 f"Middleware {method.__self__.__class__.__name__}"
                 ".process_request must return None, Response or "
                 f"Request, got {response.__class__.__name__}")
         if response:
             return response
     return (yield download_func(request=request, spider=spider))
Esempio n. 11
0
        def process_response(response):
            """Run ``response`` through every ``process_response`` hook.

            A ``Request`` (incoming, or produced by a hook) ends the chain and
            is returned as is; otherwise the last ``Response`` is returned.
            Raises ``TypeError`` for a ``None`` input and ``_InvalidOutput``
            when a hook returns anything other than a ``Response``/``Request``.
            """
            if response is None:
                raise TypeError("Received None in process_response")
            if isinstance(response, Request):
                return response

            for hook in self.methods['process_response']:
                pending = deferred_from_coro(hook(request=request, response=response, spider=spider))
                response = yield pending
                if not isinstance(response, (Response, Request)):
                    raise _InvalidOutput(
                        "Middleware %s.process_response must return Response or Request, got %s"
                        % (hook.__self__.__class__.__name__, type(response))
                    )
                if isinstance(response, Request):
                    # A new request stops further response processing.
                    break
            return response
Esempio n. 12
0
        def process_response(response: Union[Response, Request]):
            """Run ``response`` through every ``process_response`` hook.

            A ``Request`` (incoming, or produced by a hook) ends the chain and
            is returned as is; otherwise the last ``Response`` is returned.
            Raises ``TypeError`` for a ``None`` input and ``_InvalidOutput``
            when a hook returns anything other than a ``Response``/``Request``.
            """
            if response is None:
                raise TypeError("Received None in process_response")
            if isinstance(response, Request):
                return response

            for method in self.methods['process_response']:
                pending = deferred_from_coro(
                    method(request=request, response=response, spider=spider))
                response = yield pending
                if not isinstance(response, (Response, Request)):
                    raise _InvalidOutput(
                        f"Middleware {method.__qualname__} must return Response or Request, "
                        f"got {type(response)}")
                if isinstance(response, Request):
                    # A new request stops further response processing.
                    break
            return response
Esempio n. 13
0
 def close(self) -> Deferred:
     """Close the parent, then the browser if one is set.

     Written as a generator that yields each step in order.
     """
     yield super().close()
     if not self.browser:
         return
     yield deferred_from_coro(self.browser.close())
Esempio n. 14
0
 def download_request(self, request: Request, spider: Spider) -> Deferred:
     """Download ``request``, using ``self._download_request`` when flagged.

     Requests whose ``meta["pyppeteer"]`` is truthy go through
     ``self._download_request`` wrapped by ``deferred_from_coro``; all other
     requests are delegated to the parent implementation.
     """
     if not request.meta.get("pyppeteer"):
         return super().download_request(request, spider)
     return deferred_from_coro(self._download_request(request, spider))
Esempio n. 15
0
 def _engine_started_handler(self) -> Deferred:
     """Call ``self._launch_browser()`` and return it wrapped by ``deferred_from_coro``."""
     launch = self._launch_browser()
     return deferred_from_coro(launch)
Esempio n. 16
0
 def start(self):
     """Call ``self.server.serve()`` and keep the wrapped result on ``self.serv_deferred``."""
     serving = self.server.serve()
     self.serv_deferred = deferred_from_coro(serving)
Esempio n. 17
0
 def _engine_started(self) -> Deferred:
     """Log the launch, then return ``self._launch_browser()`` wrapped by ``deferred_from_coro``."""
     logger.info("Launching browser")
     launch = self._launch_browser()
     return deferred_from_coro(launch)
Esempio n. 18
0
 def close(self) -> Deferred:
     """Close the parent, then run ``self._close()``.

     Written as a generator that yields each step in order.
     """
     yield super().close()
     own_shutdown = deferred_from_coro(self._close())
     yield own_shutdown
Esempio n. 19
0
def iterate_spider_output(result):
    """Pass ``result`` through ``deferred_from_coro`` and then ``arg_to_iter``."""
    wrapped = deferred_from_coro(result)
    return arg_to_iter(wrapped)
Esempio n. 20
0
 def open_spider(self, spider):
     """Call ``self._open_spider(spider)`` and return it wrapped by ``deferred_from_coro``."""
     opening = self._open_spider(spider)
     return deferred_from_coro(opening)
Esempio n. 21
0
 def _engine_started(self) -> Deferred:
     """Launch the browser.

     Hooked to the ``engine_started`` signal because that signal supports
     handlers that return deferreds.
     """
     launch = self._launch_browser()
     return deferred_from_coro(launch)
Esempio n. 22
0
    def _process_spider_output(self,
                               response: Response,
                               spider: Spider,
                               result: Union[Iterable, AsyncIterable],
                               start_index: int = 0) -> Deferred:
        """Pass ``result`` through the ``process_spider_output`` methods.

        Entries of ``self.methods['process_spider_output']`` are visited from
        ``start_index`` onwards. Each entry may be ``None`` (skipped), a single
        method, or a ``(sync_method, async_method)`` tuple. Before a single
        method is called, ``result`` is converted to match it: a sync iterable
        is wrapped with ``as_async_generator`` for an async generator function,
        and an async iterable is collected with ``collect_asyncgen`` for any
        other method (a warning is logged the first time this happens).

        If the conversion or the method call raises, the exception is handed to
        ``self._process_spider_exception`` starting at the next method index;
        its return value is returned, unless it is a ``Failure``, in which case
        the original exception is re-raised.

        NOTE(review): the body contains ``yield`` while the annotation says
        ``Deferred``; presumably an ``inlineCallbacks``-style decorator sits
        above this method, outside the visible source -- confirm.

        :param response: passed to every method as ``response=``.
        :param spider: passed to every method as ``spider=``.
        :param result: sync or async iterable to feed into the chain.
        :param start_index: index of the first method to run.
        :return: ``MutableAsyncChain(result, recovered)`` when the last result
            is an async iterable, ``MutableChain(result, recovered)`` otherwise.
        :raises _InvalidOutput: when a method returns a non-iterable value.
        """
        # items in this iterable do not need to go through the process_spider_output
        # chain, they went through it already from the process_spider_exception method
        recovered: Union[MutableChain, MutableAsyncChain]
        # Tracks whether the value currently held in ``result`` is an async iterable;
        # refreshed after every method call at the bottom of the loop.
        last_result_is_async = isinstance(result, AsyncIterable)
        # ``recovered`` starts out with the same (sync/async) flavour as ``result``.
        if last_result_is_async:
            recovered = MutableAsyncChain()
        else:
            recovered = MutableChain()

        # There are three cases for the middleware: def foo, async def foo, def foo + async def foo_async.
        # 1. def foo. Sync iterables are passed as is, async ones are downgraded.
        # 2. async def foo. Sync iterables are upgraded, async ones are passed as is.
        # 3. def foo + async def foo_async. Iterables are passed to the respective method.
        # Storing methods and method tuples in the same list is weird but we should be able to roll this back
        # when we drop this compatibility feature.

        # Skip the first ``start_index`` entries while keeping ``method_index``
        # aligned with positions in the full method list.
        method_list = islice(self.methods['process_spider_output'],
                             start_index, None)
        for method_index, method_pair in enumerate(method_list,
                                                   start=start_index):
            # A ``None`` entry means there is nothing to call at this position.
            if method_pair is None:
                continue
            need_upgrade = need_downgrade = False
            if isinstance(method_pair, tuple):
                # This tuple handling is only needed until _async compatibility methods are removed.
                method_sync, method_async = method_pair
                # Pick the variant matching the current result; no conversion needed.
                method = method_async if last_result_is_async else method_sync
            else:
                method = method_pair
                # A single method: convert ``result`` when its flavour does not
                # match what the method is (async generator function or not).
                if not last_result_is_async and isasyncgenfunction(method):
                    need_upgrade = True
                elif last_result_is_async and not isasyncgenfunction(method):
                    need_downgrade = True
            try:
                if need_upgrade:
                    # Iterable -> AsyncIterable
                    result = as_async_generator(result)
                elif need_downgrade:
                    # Warn only once per instance (guarded by the flag below).
                    if not self.downgrade_warning_done:
                        logger.warning(
                            f"Async iterable passed to {method.__qualname__} "
                            f"was downgraded to a non-async one")
                        self.downgrade_warning_done = True
                    assert isinstance(result, AsyncIterable)
                    # AsyncIterable -> Iterable
                    result = yield deferred_from_coro(collect_asyncgen(result))
                    # Keep ``recovered`` in the same flavour as the downgraded result.
                    if isinstance(recovered, AsyncIterable):
                        recovered_collected = yield deferred_from_coro(
                            collect_asyncgen(recovered))
                        recovered = MutableChain(recovered_collected)
                # might fail directly if the output value is not a generator
                result = method(response=response,
                                result=result,
                                spider=spider)
            except Exception as ex:
                # Hand the error to the exception chain, starting at the entry
                # after the one that failed.
                exception_result = self._process_spider_exception(
                    response, spider, Failure(ex), method_index + 1)
                # A ``Failure`` coming back means the exception was not
                # converted into a usable result: re-raise the original.
                if isinstance(exception_result, Failure):
                    raise
                return exception_result
            if _isiterable(result):
                result = self._evaluate_iterable(response, spider, result,
                                                 method_index + 1, recovered)
            else:
                if iscoroutine(result):
                    result.close()  # Silence warning about not awaiting
                    msg = (f"{method.__qualname__} must be an asynchronous "
                           f"generator (i.e. use yield)")
                else:
                    msg = (
                        f"{method.__qualname__} must return an iterable, got "
                        f"{type(result)}")
                raise _InvalidOutput(msg)
            last_result_is_async = isinstance(result, AsyncIterable)

        # Combine the final result with the recovered items, using the chain
        # type that matches the flavour of the final result.
        if last_result_is_async:
            return MutableAsyncChain(result, recovered)
        else:
            return MutableChain(result, recovered)  # type: ignore[arg-type]