def process_spider_output(result, start_index=0):
    """Run ``result`` through the ``process_spider_output`` middleware methods.

    This is a closure: ``self``, ``response`` and ``spider`` are not
    parameters and are read from an enclosing scope that is not visible here.

    :param result: iterable handed to the first applicable method.
    :param start_index: position in ``self.methods['process_spider_output']``
        at which to start; earlier entries are skipped.
    :returns: a ``MutableChain`` of the final result followed by the
        recovered output, or whatever ``process_spider_exception`` returned
        when a method raised and the exception was handled.
    :raises _InvalidOutput: when a method returns something that
        ``_isiterable`` rejects.
    """
    # Output recovered by process_spider_exception already went through the
    # process_spider_output chain, so it is collected separately and only
    # appended after the regular result.
    recovered = MutableChain()

    remaining = islice(self.methods['process_spider_output'], start_index, None)
    for position, method in enumerate(remaining, start=start_index):
        if method is None:
            continue
        next_index = position + 1
        try:
            # May fail right away if the method is not a generator function.
            result = method(response=response, result=result, spider=spider)
        except Exception as ex:
            exception_result = process_spider_exception(ex, next_index)
            # Getting an exception instance back means nobody handled it:
            # re-raise the exception currently being handled.
            if isinstance(exception_result, BaseException):
                raise
            return exception_result
        if not _isiterable(result):
            msg = "Middleware {} must return an iterable, got {}"
            raise _InvalidOutput(msg.format(_fname(method), type(result)))
        # Wrap lazily so errors raised while iterating are also routed to the
        # exception handlers, starting after the current method.
        result = _evaluate_iterable(result, next_index, recovered)

    return MutableChain(result, recovered)
def _process_spider_output(self, response, spider, result, start_index=0):
    """Run ``result`` through the ``process_spider_output`` middleware methods.

    :param response: forwarded to every method as ``response=``.
    :param spider: forwarded to every method as ``spider=``.
    :param result: iterable handed to the first applicable method.
    :param start_index: position in ``self.methods['process_spider_output']``
        at which to start; earlier entries are skipped.
    :returns: a ``MutableChain`` of the final result followed by the
        recovered output, or the value returned by
        ``self._process_spider_exception`` when a method raised and the
        exception was handled.
    :raises _InvalidOutput: when a method returns something that
        ``_isiterable`` rejects.
    """
    # Output recovered by _process_spider_exception already went through the
    # process_spider_output chain, so it is collected separately and only
    # appended after the regular result.
    recovered = MutableChain()

    pending = islice(self.methods['process_spider_output'], start_index, None)
    for position, method in enumerate(pending, start=start_index):
        if method is None:
            continue
        next_index = position + 1
        try:
            # May fail right away if the method is not a generator function.
            result = method(response=response, result=result, spider=spider)
        except Exception as ex:
            exception_result = self._process_spider_exception(
                response, spider, Failure(ex), next_index)
            # A Failure coming back means the exception was not handled:
            # re-raise the exception currently being handled.
            if isinstance(exception_result, Failure):
                raise
            return exception_result
        if not _isiterable(result):
            msg = (f"Middleware {method.__qualname__} must return an "
                   f"iterable, got {type(result)}")
            raise _InvalidOutput(msg)
        # Wrap lazily so errors raised while iterating are also routed to the
        # exception handlers, starting after the current method.
        result = self._evaluate_iterable(
            response, spider, result, next_index, recovered)

    return MutableChain(result, recovered)
def _process_callback_output(self, response: Response, spider: Spider,
                             result: Iterable) -> MutableChain:
    """Send a callback's output through the spider-output middleware chain.

    The callback result is first wrapped by ``_evaluate_iterable`` with
    exception-processor index ``0``; that wrapped iterable is then given to
    ``_process_spider_output``. Output recovered at the wrapping stage is
    appended after the processed output.
    """
    recovered = MutableChain()
    guarded = self._evaluate_iterable(response, spider, result, 0, recovered)
    processed = self._process_spider_output(response, spider, guarded)
    return MutableChain(processed, recovered)
def test_mutablechain(self):
    """MutableChain yields constructor and extend() items in insertion order."""
    chained = MutableChain(range(2), [2, 3], (4, 5))

    # extend() is exercised with one and with several iterables per call.
    for extra in ((range(6, 7),), ([7, 8],), ([9, 10], (11, 12))):
        chained.extend(*extra)

    # The first three items are pulled through the three available entry
    # points: the builtin, the ``next`` method and the ``__next__`` dunder.
    self.assertEqual(next(chained), 0)
    self.assertEqual(chained.next(), 1)
    self.assertEqual(chained.__next__(), 2)

    # Whatever is left must come out in order.
    leftover = list(chained)
    self.assertEqual(leftover, list(range(3, 13)))
def _evaluate_iterable(self, response: Response, spider: Spider,
                       iterable: Iterable, exception_processor_index: int,
                       recover_to: MutableChain) -> Generator:
    """Lazily re-yield ``iterable``, routing iteration errors to the handlers.

    :param iterable: the iterable to consume item by item.
    :param exception_processor_index: index passed on to
        ``_process_spider_exception`` when iterating raises.
    :param recover_to: chain that is extended with the handler's output when
        the exception is handled.
    :raises Exception: the original exception is re-raised when
        ``_process_spider_exception`` returns a ``Failure``.
    """
    try:
        for item in iterable:
            yield item
    except Exception as ex:
        failure = Failure(ex)
        outcome = self._process_spider_exception(
            response, spider, failure, exception_processor_index)
        # A Failure coming back means the exception was not handled:
        # re-raise the exception currently being handled.
        if isinstance(outcome, Failure):
            raise
        recover_to.extend(outcome)
def test_mutablechain(self):
    """MutableChain keeps insertion order and ``next()`` emits one warning."""
    chained = MutableChain(range(2), [2, 3], (4, 5))

    # extend() is exercised with one and with several iterables per call.
    for extra in ((range(6, 7),), ([7, 8],), ([9, 10], (11, 12))):
        chained.extend(*extra)

    self.assertEqual(next(chained), 0)
    self.assertEqual(chained.__next__(), 1)

    # Calling the ``next`` method is expected to record exactly one warning
    # whose message mentions the ``__next__`` replacement.
    with catch_warnings(record=True) as caught:
        self.assertEqual(chained.next(), 2)
        self.assertEqual(len(caught), 1)
        self.assertIn('scrapy.utils.python.MutableChain.__next__',
                      str(caught[0].message))

    # Whatever is left must come out in order.
    leftover = list(chained)
    self.assertEqual(leftover, list(range(3, 13)))
def process_spider_output(result, start_index=0):
    """Run ``result`` through the ``process_spider_output`` middleware methods.

    This is a closure: ``self``, ``response`` and ``spider`` are not
    parameters and are read from an enclosing scope that is not visible here.

    :param result: iterable handed to the first applicable method.
    :param start_index: position in ``self.methods['process_spider_output']``
        at which to start; earlier entries are skipped.
    :returns: ``chain(result, recovered)``, or whatever
        ``process_spider_exception`` returned when a method raised and the
        exception was handled.
    :raises _InvalidOutput: when a method returns something that
        ``_isiterable`` rejects.
    """
    # Output recovered by process_spider_exception already went through the
    # process_spider_output chain, so it is collected separately and only
    # appended after the regular result.
    recovered = MutableChain()

    def _guarded(iterable, index):
        # Lazily re-yield ``iterable``; an exception raised while iterating is
        # given to process_spider_exception, starting at ``index + 1``.
        try:
            for item in iterable:
                yield item
        except Exception as ex:
            outcome = process_spider_exception(Failure(ex), index + 1)
            if isinstance(outcome, Failure):
                raise
            recovered.extend(outcome)

    pending = islice(self.methods['process_spider_output'], start_index, None)
    for position, method in enumerate(pending, start=start_index):
        if method is None:
            continue
        try:
            # May fail right away if the method is not a generator function.
            result = method(response=response, result=result, spider=spider)
        except Exception as ex:
            outcome = process_spider_exception(Failure(ex), position + 1)
            # A Failure coming back means the exception was not handled:
            # re-raise the exception currently being handled.
            if isinstance(outcome, Failure):
                raise
            return outcome
        if not _isiterable(result):
            raise _InvalidOutput(
                'Middleware {} must return an iterable, got {}'.format(
                    fname(method), type(result)))
        result = _guarded(result, position)

    return chain(result, recovered)
async def _process_callback_output(
    self, response: Response, spider: Spider, result: Union[Iterable, AsyncIterable]
) -> Union[MutableChain, MutableAsyncChain]:
    """Send a callback's output through the spider-output middleware chain.

    The ``recovered`` chain starts out async or sync to match ``result``.
    If the processed output ends up synchronous while ``recovered`` is still
    asynchronous, ``recovered`` is collected into a ``MutableChain`` so both
    parts of the returned chain are of the same kind.
    """
    recovered: Union[MutableChain, MutableAsyncChain] = (
        MutableAsyncChain() if isinstance(result, AsyncIterable) else MutableChain()
    )
    guarded = self._evaluate_iterable(response, spider, result, 0, recovered)
    processed = await maybe_deferred_to_future(
        self._process_spider_output(response, spider, guarded))

    if isinstance(processed, AsyncIterable):
        return MutableAsyncChain(processed, recovered)

    # The processed output is synchronous: make ``recovered`` synchronous too.
    if isinstance(recovered, AsyncIterable):
        recovered = MutableChain(await collect_asyncgen(recovered))
    return MutableChain(processed, recovered)  # type: ignore[arg-type]
def process_callback_output(result):
    """Send a callback's output through the spider-output middleware chain.

    ``result`` is first wrapped by ``_evaluate_iterable`` with index ``0``,
    then given to ``process_spider_output``. Output recovered at the wrapping
    stage is appended after the processed output.
    """
    recovered = MutableChain()
    guarded = _evaluate_iterable(result, 0, recovered)
    processed = process_spider_output(guarded)
    return MutableChain(processed, recovered)
def _process_spider_output(self, response: Response, spider: Spider,
                           result: Union[Iterable, AsyncIterable], start_index: int = 0) -> Deferred:
    """Run ``result`` through the ``process_spider_output`` middleware methods.

    Handles both synchronous and asynchronous iterables, converting between
    them when a method's kind does not match the kind of the current result.

    NOTE(review): the body ``yield``s Deferreds and ``return``s values while
    being annotated ``-> Deferred``, so it is presumably wrapped by a
    decorator such as Twisted's ``inlineCallbacks`` that is not visible
    here -- confirm against the surrounding file.

    :param response: forwarded to every method as ``response=``.
    :param spider: forwarded to every method as ``spider=``.
    :param result: sync or async iterable handed to the first applicable
        method.
    :param start_index: position in ``self.methods['process_spider_output']``
        at which to start; earlier entries are skipped.
    :returns: ``MutableAsyncChain(result, recovered)`` when the last result
        is an ``AsyncIterable``, otherwise ``MutableChain(result, recovered)``;
        or the value returned by ``self._process_spider_exception`` when a
        method raised and the exception was handled.
    :raises _InvalidOutput: when a method returns something that
        ``_isiterable`` rejects (including a coroutine, which is closed
        first).
    """
    # items in this iterable do not need to go through the process_spider_output
    # chain, they went through it already from the process_spider_exception method
    recovered: Union[MutableChain, MutableAsyncChain]
    # Tracks the kind of the current ``result``; refreshed after every method.
    last_result_is_async = isinstance(result, AsyncIterable)
    if last_result_is_async:
        recovered = MutableAsyncChain()
    else:
        recovered = MutableChain()

    # There are three cases for the middleware: def foo, async def foo, def foo + async def foo_async.
    # 1. def foo. Sync iterables are passed as is, async ones are downgraded.
    # 2. async def foo. Sync iterables are upgraded, async ones are passed as is.
    # 3. def foo + async def foo_async. Iterables are passed to the respective method.
    # Storing methods and method tuples in the same list is weird but we should be able to roll this back
    # when we drop this compatibility feature.

    method_list = islice(self.methods['process_spider_output'], start_index, None)
    for method_index, method_pair in enumerate(method_list, start=start_index):
        if method_pair is None:
            continue
        need_upgrade = need_downgrade = False
        if isinstance(method_pair, tuple):
            # This tuple handling is only needed until _async compatibility methods are removed.
            method_sync, method_async = method_pair
            # Pick the variant that matches the current result; no conversion needed.
            method = method_async if last_result_is_async else method_sync
        else:
            method = method_pair
            if not last_result_is_async and isasyncgenfunction(method):
                need_upgrade = True
            elif last_result_is_async and not isasyncgenfunction(method):
                need_downgrade = True
        try:
            # The conversions are inside the ``try`` so that errors raised while
            # collecting an async iterable are handled like method errors.
            if need_upgrade:
                # Iterable -> AsyncIterable
                result = as_async_generator(result)
            elif need_downgrade:
                # Warn only once per instance (flag stored on ``self``).
                if not self.downgrade_warning_done:
                    logger.warning(
                        f"Async iterable passed to {method.__qualname__} "
                        f"was downgraded to a non-async one")
                    self.downgrade_warning_done = True
                assert isinstance(result, AsyncIterable)
                # AsyncIterable -> Iterable
                result = yield deferred_from_coro(collect_asyncgen(result))
                # ``recovered`` is collected as well so that it matches the
                # now-synchronous result.
                if isinstance(recovered, AsyncIterable):
                    recovered_collected = yield deferred_from_coro(
                        collect_asyncgen(recovered))
                    recovered = MutableChain(recovered_collected)
            # might fail directly if the output value is not a generator
            result = method(response=response, result=result, spider=spider)
        except Exception as ex:
            exception_result = self._process_spider_exception(
                response, spider, Failure(ex), method_index + 1)
            # A Failure coming back means the exception was not handled:
            # re-raise the exception currently being handled.
            if isinstance(exception_result, Failure):
                raise
            return exception_result
        if _isiterable(result):
            # Wrap lazily so errors raised while iterating are also routed to
            # the exception handlers, starting after the current method.
            result = self._evaluate_iterable(response, spider, result, method_index + 1, recovered)
        else:
            if iscoroutine(result):
                result.close()  # Silence warning about not awaiting
                msg = (f"{method.__qualname__} must be an asynchronous "
                       f"generator (i.e. use yield)")
            else:
                msg = (
                    f"{method.__qualname__} must return an iterable, got "
                    f"{type(result)}")
            raise _InvalidOutput(msg)
        last_result_is_async = isinstance(result, AsyncIterable)

    if last_result_is_async:
        return MutableAsyncChain(result, recovered)
    else:
        return MutableChain(result, recovered)  # type: ignore[arg-type]