コード例 #1
0
        def process_spider_output(result, start_index=0):
            """Feed ``result`` through the ``process_spider_output`` middlewares.

            Middlewares are read from ``self.methods['process_spider_output']``
            starting at position ``start_index``; ``None`` entries are skipped.

            If a middleware call raises, the exception is handed to
            ``process_spider_exception`` together with the position that
            follows the failing middleware.  When that handler gives back an
            exception object the original error is re-raised, otherwise the
            handler's return value is returned as is.

            Raises ``_InvalidOutput`` when a middleware returns a
            non-iterable value.
            """
            # Items that end up here do not need to go through the
            # process_spider_output chain again: they already went through it
            # from the process_spider_exception method.
            recovered = MutableChain()

            pending = islice(self.methods['process_spider_output'],
                             start_index, None)
            for position, middleware in enumerate(pending, start=start_index):
                if middleware is None:
                    continue
                following = position + 1
                try:
                    # May fail right away when the middleware is not a
                    # generator function.
                    result = middleware(response=response,
                                        result=result,
                                        spider=spider)
                except Exception as ex:
                    handled = process_spider_exception(ex, following)
                    if isinstance(handled, BaseException):
                        raise
                    return handled
                if not _isiterable(result):
                    msg = "Middleware {} must return an iterable, got {}"
                    raise _InvalidOutput(
                        msg.format(_fname(middleware), type(result)))
                # Wrap the output so errors raised while it is being consumed
                # are routed to the exception handlers as well.
                result = _evaluate_iterable(result, following, recovered)

            return MutableChain(result, recovered)
コード例 #2
0
    def _process_spider_output(self, response, spider, result, start_index=0):
        """Feed ``result`` through the ``process_spider_output`` middlewares.

        Middlewares are read from ``self.methods['process_spider_output']``
        starting at position ``start_index``; ``None`` entries are skipped.

        If a middleware call raises, the exception is wrapped in a ``Failure``
        and handed to ``self._process_spider_exception`` together with the
        position that follows the failing middleware.  When that call gives
        back a ``Failure`` the original error is re-raised, otherwise its
        return value is returned as is.

        Raises ``_InvalidOutput`` when a middleware returns a non-iterable
        value.
        """
        # Items that end up here do not need to go through the
        # process_spider_output chain again: they already went through it
        # from the process_spider_exception method.
        recovered = MutableChain()

        pending = islice(self.methods['process_spider_output'],
                         start_index, None)
        for position, middleware in enumerate(pending, start=start_index):
            if middleware is None:
                continue
            following = position + 1
            try:
                # May fail right away when the middleware is not a generator
                # function.
                result = middleware(response=response,
                                    result=result,
                                    spider=spider)
            except Exception as ex:
                handled = self._process_spider_exception(
                    response, spider, Failure(ex), following)
                if isinstance(handled, Failure):
                    raise
                return handled
            if not _isiterable(result):
                raise _InvalidOutput(
                    f"Middleware {middleware.__qualname__} must return an "
                    f"iterable, got {type(result)}")
            # Wrap the output so errors raised while it is being consumed are
            # routed to the exception handlers as well.
            result = self._evaluate_iterable(response, spider, result,
                                             following, recovered)

        return MutableChain(result, recovered)
コード例 #3
0
ファイル: spidermw.py プロジェクト: justinschuster/scrapy
 def _process_callback_output(self, response: Response, spider: Spider,
                              result: Iterable) -> MutableChain:
     """Run a callback's output through the spider-output middlewares.

     ``result`` is first wrapped by ``self._evaluate_iterable`` with
     exception-processor index ``0``, then passed to
     ``self._process_spider_output``.  The returned chain yields the
     processed output followed by whatever was collected in ``recovered``.
     """
     recovered = MutableChain()
     guarded = self._evaluate_iterable(response, spider, result, 0,
                                       recovered)
     processed = self._process_spider_output(response, spider, guarded)
     return MutableChain(processed, recovered)
コード例 #4
0
ファイル: test_utils_python.py プロジェクト: zz123okl1/scrapy
 def test_mutablechain(self):
     """MutableChain yields constructor and ``extend`` iterables in order."""
     chained = MutableChain(range(2), [2, 3], (4, 5))
     # extend() is exercised with one and with several iterables.
     chained.extend(range(6, 7))
     chained.extend([7, 8])
     chained.extend([9, 10], (11, 12))

     # The three ways of advancing the chain consume one item each.
     self.assertEqual(next(chained), 0)
     self.assertEqual(chained.next(), 1)
     self.assertEqual(chained.__next__(), 2)

     # Everything that is left comes out in insertion order.
     expected_rest = list(range(3, 13))
     self.assertEqual(list(chained), expected_rest)
コード例 #5
0
ファイル: spidermw.py プロジェクト: justinschuster/scrapy
 def _evaluate_iterable(self, response: Response, spider: Spider,
                        iterable: Iterable, exception_processor_index: int,
                        recover_to: MutableChain) -> Generator:
     """Yield the items of ``iterable``, handling errors raised on the way.

     When iteration raises, the exception is wrapped in a ``Failure`` and
     passed to ``self._process_spider_exception`` along with
     ``exception_processor_index``.  If that call returns a ``Failure`` the
     original exception is re-raised; otherwise its return value is appended
     to ``recover_to`` and this generator finishes.
     """
     try:
         for element in iterable:
             yield element
     except Exception as ex:
         outcome = self._process_spider_exception(
             response, spider, Failure(ex), exception_processor_index)
         if not isinstance(outcome, Failure):
             # The error was handled: keep the replacement output.
             recover_to.extend(outcome)
             return
         raise
コード例 #6
0
ファイル: test_utils_python.py プロジェクト: elacuesta/scrapy
 def test_mutablechain(self):
     """Items from the constructor and from ``extend`` come out in order."""
     sequence = MutableChain(range(2), [2, 3], (4, 5))
     sequence.extend(range(6, 7))
     sequence.extend([7, 8])
     # extend() also accepts several iterables in a single call.
     sequence.extend([9, 10], (11, 12))

     # Advance once through each supported spelling.
     first = next(sequence)
     self.assertEqual(first, 0)
     second = sequence.next()
     self.assertEqual(second, 1)
     third = sequence.__next__()
     self.assertEqual(third, 2)

     # The remainder is the rest of the 0..12 run.
     leftover = list(sequence)
     self.assertEqual(leftover, list(range(3, 13)))
コード例 #7
0
 def test_mutablechain(self):
     """MutableChain keeps order; ``next()`` emits exactly one warning."""
     chained = MutableChain(range(2), [2, 3], (4, 5))
     chained.extend(range(6, 7))
     chained.extend([7, 8])
     chained.extend([9, 10], (11, 12))

     self.assertEqual(next(chained), 0)
     self.assertEqual(chained.__next__(), 1)

     # Calling the ``next`` method must produce a single warning whose
     # message mentions ``__next__``.
     with catch_warnings(record=True) as caught:
         self.assertEqual(chained.next(), 2)
         self.assertEqual(len(caught), 1)
         warning_text = str(caught[0].message)
         self.assertIn('scrapy.utils.python.MutableChain.__next__',
                       warning_text)

     self.assertEqual(list(chained), list(range(3, 13)))
コード例 #8
0
ファイル: spidermw.py プロジェクト: zjkanjie/scrapy
        def process_spider_output(result, start_index=0):
            """Feed ``result`` through the ``process_spider_output`` middlewares.

            Middlewares are read from ``self.methods['process_spider_output']``
            starting at position ``start_index``; ``None`` entries are skipped.

            If a middleware call raises, the exception is wrapped in a
            ``Failure`` and handed to ``process_spider_exception`` together
            with the position that follows the failing middleware.  When that
            handler gives back a ``Failure`` the original error is re-raised,
            otherwise the handler's return value is returned as is.

            Raises ``_InvalidOutput`` when a middleware returns a
            non-iterable value.
            """
            # Items that end up here do not need to go through the
            # process_spider_output chain again: they already went through it
            # from the process_spider_exception method.
            recovered = MutableChain()

            def evaluate_iterable(iterable, index):
                # Re-yield ``iterable`` lazily; an error raised while it is
                # consumed goes to the handlers located after ``index``.
                try:
                    for item in iterable:
                        yield item
                except Exception as ex:
                    outcome = process_spider_exception(Failure(ex), index + 1)
                    if isinstance(outcome, Failure):
                        raise
                    recovered.extend(outcome)

            pending = islice(self.methods['process_spider_output'],
                             start_index, None)
            for position, middleware in enumerate(pending, start=start_index):
                if middleware is None:
                    continue
                try:
                    # May fail right away when the middleware is not a
                    # generator function.
                    result = middleware(response=response, result=result,
                                        spider=spider)
                except Exception as ex:
                    outcome = process_spider_exception(Failure(ex),
                                                       position + 1)
                    if isinstance(outcome, Failure):
                        raise
                    return outcome
                if not _isiterable(result):
                    raise _InvalidOutput(
                        'Middleware {} must return an iterable, got {}'
                        .format(fname(middleware), type(result)))
                result = evaluate_iterable(result, position)

            return chain(result, recovered)
コード例 #9
0
ファイル: spidermw.py プロジェクト: wwjiang007/scrapy
 async def _process_callback_output(
     self, response: Response, spider: Spider, result: Union[Iterable,
                                                             AsyncIterable]
 ) -> Union[MutableChain, MutableAsyncChain]:
     """Run a callback's output through the spider-output middlewares.

     ``result`` is wrapped by ``self._evaluate_iterable`` with
     exception-processor index ``0`` and then awaited through
     ``self._process_spider_output``.  The returned chain is a
     ``MutableAsyncChain`` when the processed output is an async iterable
     and a ``MutableChain`` otherwise; in both cases ``recovered`` is
     appended after the processed output.
     """
     # The recovery container starts out with the same sync/async flavour
     # as the callback output.
     recovered: Union[MutableChain, MutableAsyncChain] = (
         MutableAsyncChain() if isinstance(result, AsyncIterable)
         else MutableChain()
     )
     guarded = self._evaluate_iterable(response, spider, result, 0,
                                       recovered)
     processed = await maybe_deferred_to_future(
         self._process_spider_output(response, spider, guarded))

     if isinstance(processed, AsyncIterable):
         return MutableAsyncChain(processed, recovered)

     if isinstance(recovered, AsyncIterable):
         # Sync output cannot be chained with an async container, so the
         # recovered items are collected into a sync chain first.
         collected = await collect_asyncgen(recovered)
         recovered = MutableChain(collected)
     return MutableChain(processed, recovered)  # type: ignore[arg-type]
コード例 #10
0
 def process_callback_output(result):
     """Run a callback's output through the spider-output middlewares.

     ``result`` is wrapped by ``_evaluate_iterable`` with index ``0`` and
     passed to ``process_spider_output``; the returned chain yields that
     output followed by whatever was collected in ``recovered``.
     """
     recovered = MutableChain()
     guarded = _evaluate_iterable(result, 0, recovered)
     processed = process_spider_output(guarded)
     return MutableChain(processed, recovered)
コード例 #11
0
ファイル: spidermw.py プロジェクト: wwjiang007/scrapy
    def _process_spider_output(self,
                               response: Response,
                               spider: Spider,
                               result: Union[Iterable, AsyncIterable],
                               start_index: int = 0) -> Deferred:
        """Feed ``result`` through the ``process_spider_output`` middlewares.

        Entries are read from ``self.methods['process_spider_output']``
        starting at ``start_index``; ``None`` entries are skipped.  An entry
        is either a single callable or a ``(sync, async)`` tuple, in which
        case the member matching the current kind of ``result`` is used.
        For single callables the current ``result`` is converted between
        sync and async iterables when its kind does not match the callable.

        If the conversion or the middleware call raises, the exception is
        wrapped in a ``Failure`` and passed to
        ``self._process_spider_exception`` with the position following the
        failing middleware; a ``Failure`` coming back re-raises the original
        error, anything else is returned as is.

        Raises ``_InvalidOutput`` when a middleware returns something that
        is not iterable.

        The final value is a ``MutableAsyncChain`` or ``MutableChain`` of the
        processed output followed by ``recovered``.

        NOTE(review): the body uses ``yield`` to wait on deferreds and is
        annotated as returning ``Deferred``, so it is presumably wrapped by a
        decorator such as Twisted's ``inlineCallbacks`` that is not visible
        in this excerpt -- confirm against the full file.
        """
        # items in this iterable do not need to go through the process_spider_output
        # chain, they went through it already from the process_spider_exception method
        recovered: Union[MutableChain, MutableAsyncChain]
        # Tracks whether the *current* ``result`` is async; refreshed after
        # every middleware at the bottom of the loop.
        last_result_is_async = isinstance(result, AsyncIterable)
        if last_result_is_async:
            recovered = MutableAsyncChain()
        else:
            recovered = MutableChain()

        # There are three cases for the middleware: def foo, async def foo, def foo + async def foo_async.
        # 1. def foo. Sync iterables are passed as is, async ones are downgraded.
        # 2. async def foo. Sync iterables are upgraded, async ones are passed as is.
        # 3. def foo + async def foo_async. Iterables are passed to the respective method.
        # Storing methods and method tuples in the same list is weird but we should be able to roll this back
        # when we drop this compatibility feature.

        method_list = islice(self.methods['process_spider_output'],
                             start_index, None)
        for method_index, method_pair in enumerate(method_list,
                                                   start=start_index):
            if method_pair is None:
                continue
            need_upgrade = need_downgrade = False
            if isinstance(method_pair, tuple):
                # This tuple handling is only needed until _async compatibility methods are removed.
                method_sync, method_async = method_pair
                method = method_async if last_result_is_async else method_sync
            else:
                method = method_pair
                if not last_result_is_async and isasyncgenfunction(method):
                    need_upgrade = True
                elif last_result_is_async and not isasyncgenfunction(method):
                    need_downgrade = True
            # The conversion sits inside the ``try`` so that an error raised
            # while converting is handled like a middleware error.
            try:
                if need_upgrade:
                    # Iterable -> AsyncIterable
                    result = as_async_generator(result)
                elif need_downgrade:
                    # Log the downgrade only once, guarded by the flag on self.
                    if not self.downgrade_warning_done:
                        logger.warning(
                            f"Async iterable passed to {method.__qualname__} "
                            f"was downgraded to a non-async one")
                        self.downgrade_warning_done = True
                    assert isinstance(result, AsyncIterable)
                    # AsyncIterable -> Iterable
                    result = yield deferred_from_coro(collect_asyncgen(result))
                    # Keep ``recovered`` the same kind as the downgraded
                    # result by collecting it into a sync chain too.
                    if isinstance(recovered, AsyncIterable):
                        recovered_collected = yield deferred_from_coro(
                            collect_asyncgen(recovered))
                        recovered = MutableChain(recovered_collected)
                # might fail directly if the output value is not a generator
                result = method(response=response,
                                result=result,
                                spider=spider)
            except Exception as ex:
                # Handlers located after the failing middleware get a chance
                # to deal with the error.
                exception_result = self._process_spider_exception(
                    response, spider, Failure(ex), method_index + 1)
                if isinstance(exception_result, Failure):
                    raise
                return exception_result
            if _isiterable(result):
                # Wrap the output so errors raised while it is consumed are
                # handled as well, with recovered output going to ``recovered``.
                result = self._evaluate_iterable(response, spider, result,
                                                 method_index + 1, recovered)
            else:
                if iscoroutine(result):
                    result.close()  # Silence warning about not awaiting
                    msg = (f"{method.__qualname__} must be an asynchronous "
                           f"generator (i.e. use yield)")
                else:
                    msg = (
                        f"{method.__qualname__} must return an iterable, got "
                        f"{type(result)}")
                raise _InvalidOutput(msg)
            last_result_is_async = isinstance(result, AsyncIterable)

        # Pick the chain type that matches the final kind of ``result``.
        if last_result_is_async:
            return MutableAsyncChain(result, recovered)
        else:
            return MutableChain(result, recovered)  # type: ignore[arg-type]