def basic_items_check(items, obligate_fields, primary_fields, request_url):
    """Validate that every item carries the required and non-empty fields.

    Args:
        items: iterable of mapping-like items (each must offer ``keys()``
            and ``get()``).
        obligate_fields: field names that must be present in every item.
            Any iterable is accepted; sets/frozensets are used as given.
        primary_fields: field names whose value must be truthy in every
            item. Any iterable is accepted, including one-shot iterators.
        request_url: URL of the originating request, used in error messages.

    Raises:
        _InvalidOutput: if an item lacks an obligate field, or a primary
            field is missing or has a falsy value.
    """
    # Keep sets/frozensets untouched so the reported "missing fields" value
    # has the same type as before; coerce anything else (list, tuple, ...).
    if isinstance(obligate_fields, (set, frozenset)):
        required = obligate_fields
    else:
        required = set(obligate_fields)
    # Materialise once so a generator is not exhausted after the first item.
    primary = tuple(primary_fields)

    for item in items:
        missing_fields = required.difference(item.keys())
        if missing_fields:
            raise _InvalidOutput(
                "Obligate fields check failed. Request url: %s. "
                "Missing fields: %s" % (request_url, missing_fields))
        for field in primary:
            if not item.get(field, ""):
                raise _InvalidOutput(
                    "Primary fields check failed. Request url: %s. "
                    "Empty field: %s" % (request_url, field))
        def process_spider_output(result, start_index=0):
            """Pass ``result`` through the ``process_spider_output`` hooks.

            Hooks are read from ``self.methods['process_spider_output']``
            starting at ``start_index``; ``None`` entries are skipped.
            ``self``, ``response`` and ``spider`` come from the enclosing
            scope.

            Returns a ``MutableChain`` of the final result plus ``recovered``,
            or whatever ``process_spider_exception`` returned when a hook
            raised and that value is not an exception instance.

            Raises ``_InvalidOutput`` when a hook returns a non-iterable, and
            re-raises a hook's own exception when exception handling hands
            back an exception instance.
            """
            # items in this iterable do not need to go through the process_spider_output
            # chain, they went through it already from the process_spider_exception method
            recovered = MutableChain()

            method_list = islice(self.methods['process_spider_output'],
                                 start_index, None)
            for method_index, method in enumerate(method_list,
                                                  start=start_index):
                if method is None:
                    continue
                try:
                    # might fail directly if the output value is not a generator
                    result = method(response=response,
                                    result=result,
                                    spider=spider)
                except Exception as ex:
                    # exception handling resumes at the hook after this one
                    exception_result = process_spider_exception(
                        ex, method_index + 1)
                    # NOTE(review): Exception is a subclass of BaseException,
                    # so the first tuple member is redundant.
                    if isinstance(exception_result,
                                  (Exception, BaseException)):
                        # unhandled: re-raise the exception caught above
                        raise
                    return exception_result
                if _isiterable(result):
                    # wrap the output for the next hook; ``recovered`` is
                    # handed to the helper alongside the next hook index
                    result = _evaluate_iterable(result, method_index + 1,
                                                recovered)
                else:
                    msg = "Middleware {} must return an iterable, got {}"
                    raise _InvalidOutput(
                        msg.format(_fname(method), type(result)))

            return MutableChain(result, recovered)
Esempio n. 3
0
 def process_spider_exception(_failure, start_index=0):
     """Offer a failure to the ``process_spider_exception`` hooks.

     Hooks are read from ``self.methods['process_spider_exception']``
     starting at ``start_index``; ``self``, ``response`` and ``spider`` come
     from the enclosing scope. Returns ``_failure`` unchanged when no hook
     produces an iterable, otherwise the value of ``process_spider_output``
     resumed at the following index. Raises ``_InvalidOutput`` when a hook
     returns something that is neither ``None`` nor an iterable.
     """
     error = _failure.value
     if isinstance(error, _InvalidOutput):
         # _InvalidOutput is never offered to the middlewares
         return _failure
     handlers = islice(
         self.methods['process_spider_exception'], start_index, None)
     # run the spider middlewares' process_spider_exception hooks in order
     for position, handler in enumerate(handlers, start=start_index):
         if handler is None:
             continue
         outcome = handler(response=response, exception=error, spider=spider)
         if _isiterable(outcome):
             # an iterable ends exception handling: control goes back to
             # the process_spider_output chain from the next index on
             return process_spider_output(outcome, position + 1)
         if outcome is not None:
             template = 'Middleware {} must return None or an iterable, got {}'
             raise _InvalidOutput(
                 template.format(fname(handler), type(outcome)))
     return _failure
Esempio n. 4
0
    def _process_spider_output(self, response, spider, result, start_index=0):
        """Pass ``result`` through the ``process_spider_output`` hooks.

        Hooks are read from ``self.methods['process_spider_output']``
        starting at ``start_index``; ``None`` entries are skipped.

        Returns a ``MutableChain`` of the final result plus ``recovered``, or
        the value from ``_process_spider_exception`` when a hook raised and
        that value is not a ``Failure``.

        Raises ``_InvalidOutput`` when a hook returns a non-iterable, and
        re-raises a hook's own exception when exception handling returns a
        ``Failure``.
        """
        # items in this iterable do not need to go through the process_spider_output
        # chain, they went through it already from the process_spider_exception method
        recovered = MutableChain()

        method_list = islice(self.methods['process_spider_output'],
                             start_index, None)
        for method_index, method in enumerate(method_list, start=start_index):
            if method is None:
                continue
            try:
                # might fail directly if the output value is not a generator
                result = method(response=response,
                                result=result,
                                spider=spider)
            except Exception as ex:
                # exception handling resumes at the hook after this one
                exception_result = self._process_spider_exception(
                    response, spider, Failure(ex), method_index + 1)
                if isinstance(exception_result, Failure):
                    # unhandled: re-raise the exception caught above
                    raise
                return exception_result
            if _isiterable(result):
                # wrap the output for the next hook; ``recovered`` is handed
                # to the helper alongside the next hook index
                result = self._evaluate_iterable(response, spider, result,
                                                 method_index + 1, recovered)
            else:
                msg = (f"Middleware {method.__qualname__} must return an "
                       f"iterable, got {type(result)}")
                raise _InvalidOutput(msg)

        return MutableChain(result, recovered)
Esempio n. 5
0
 def _process_spider_exception(self, response, spider, _failure, start_index=0):
     """Offer a failure to the ``process_spider_exception`` hooks.

     Hooks are read from ``self.methods['process_spider_exception']``
     starting at ``start_index``. Returns ``_failure`` unchanged when no hook
     produces an iterable, otherwise the value of
     ``self._process_spider_output`` resumed at the following index. Raises
     ``_InvalidOutput`` when a hook returns something that is neither
     ``None`` nor an iterable.
     """
     error = _failure.value
     if isinstance(error, _InvalidOutput):
         # _InvalidOutput is never offered to the middlewares
         return _failure
     handlers = islice(
         self.methods['process_spider_exception'], start_index, None)
     for position, handler in enumerate(handlers, start=start_index):
         if handler is None:
             continue
         outcome = handler(response=response, exception=error, spider=spider)
         if _isiterable(outcome):
             # an iterable ends exception handling: control goes back to
             # the process_spider_output chain from the next index on
             return self._process_spider_output(
                 response, spider, outcome, position + 1)
         if outcome is not None:
             msg = (f"Middleware {handler.__qualname__} must return None "
                    f"or an iterable, got {type(outcome)}")
             raise _InvalidOutput(msg)
     return _failure
Esempio n. 6
0
        def process_spider_output(result, start_index=0):
            """Pass ``result`` through the ``process_spider_output`` hooks.

            Hooks are read from ``self.methods['process_spider_output']``
            starting at ``start_index``; ``None`` entries are skipped.
            ``self``, ``response`` and ``spider`` come from the enclosing
            scope.

            Returns ``chain(result, recovered)``, or the value from
            ``process_spider_exception`` when a hook raised and that value is
            not a ``Failure``. Raises ``_InvalidOutput`` when a hook returns
            a non-iterable.
            """
            # items in this iterable do not need to go through the process_spider_output
            # chain, they went through it already from the process_spider_exception method
            recovered = MutableChain()

            def evaluate_iterable(iterable, index):
                # Lazily re-yield ``iterable``. If iterating it raises, offer
                # the error to the exception hooks starting at ``index + 1``.
                try:
                    for r in iterable:
                        yield r
                except Exception as ex:
                    exception_result = process_spider_exception(Failure(ex), index+1)
                    if isinstance(exception_result, Failure):
                        # unhandled: re-raise the exception caught above
                        raise
                    # handled: the replacement output is collected in ``recovered``
                    recovered.extend(exception_result)

            method_list = islice(self.methods['process_spider_output'], start_index, None)
            for method_index, method in enumerate(method_list, start=start_index):
                if method is None:
                    continue
                # the following might fail directly if the output value is not a generator
                try:
                    result = method(response=response, result=result, spider=spider)
                except Exception as ex:
                    # exception handling resumes at the hook after this one
                    exception_result = process_spider_exception(Failure(ex), method_index+1)
                    if isinstance(exception_result, Failure):
                        raise
                    return exception_result
                if _isiterable(result):
                    # evaluate_iterable adds 1 to the index itself
                    result = evaluate_iterable(result, method_index)
                else:
                    raise _InvalidOutput('Middleware {} must return an iterable, got {}' \
                                         .format(fname(method), type(result)))

            return chain(result, recovered)
Esempio n. 7
0
        def process_spider_output(result, start_index=0):
            """Pass ``result`` through the ``process_spider_output`` hooks.

            Hooks are read from ``self.methods['process_spider_output']``
            starting at ``start_index``; ``None`` entries are skipped.
            ``self``, ``response`` and ``spider`` come from the enclosing
            scope.

            Returns ``chain(result, recovered)``, or the value from
            ``process_spider_exception`` when a hook raised and that value is
            not a ``Failure``. Raises ``_InvalidOutput`` when a hook returns
            a non-iterable.
            """
            # items in this iterable do not need to go through the process_spider_output
            # chain, they went through it already from the process_spider_exception method
            recovered = MutableChain()

            def evaluate_iterable(iterable, index):
                # Lazily re-yield ``iterable``. If iterating it raises, offer
                # the error to the exception hooks starting at ``index + 1``.
                try:
                    for r in iterable:
                        yield r
                except Exception as ex:
                    exception_result = process_spider_exception(Failure(ex), index+1)
                    if isinstance(exception_result, Failure):
                        # unhandled: re-raise the exception caught above
                        raise
                    # handled: the replacement output is collected in ``recovered``
                    recovered.extend(exception_result)

            method_list = islice(self.methods['process_spider_output'], start_index, None)
            for method_index, method in enumerate(method_list, start=start_index):
                if method is None:
                    continue
                # the following might fail directly if the output value is not a generator
                try:
                    result = method(response=response, result=result, spider=spider)
                except Exception as ex:
                    # exception handling resumes at the hook after this one
                    exception_result = process_spider_exception(Failure(ex), method_index+1)
                    if isinstance(exception_result, Failure):
                        raise
                    return exception_result
                if _isiterable(result):
                    # evaluate_iterable adds 1 to the index itself
                    result = evaluate_iterable(result, method_index)
                else:
                    raise _InvalidOutput('Middleware {} must return an iterable, got {}' \
                                         .format(fname(method), type(result)))

            return chain(result, recovered)
Esempio n. 8
0
 def process_request(request):
     """Run the ``process_request`` hooks, falling back to the download.

     Generator: each hook's return value is yielded and the value sent back
     is inspected. The first truthy value is handed to ``defer.returnValue``;
     if no hook produces one, the value sent back for
     ``download_func(request=request, spider=spider)`` is used instead.
     ``self``, ``spider`` and ``download_func`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook yields something other than
     ``None``, a ``Response`` or a ``Request``.
     """
     for mw_method in self.methods['process_request']:
         outcome = yield mw_method(request=request, spider=spider)
         if outcome is None:
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 'Middleware %s.process_request must return None, Response or Request, got %s'
                 % (six.get_method_self(mw_method).__class__.__name__,
                    outcome.__class__.__name__))
         if outcome:
             defer.returnValue(outcome)
     downloaded = yield download_func(request=request, spider=spider)
     defer.returnValue(downloaded)
Esempio n. 9
0
 def process_exception(_failure):
     """Run the ``process_exception`` hooks for a failed download.

     Generator: each hook is called with ``_failure.value`` and its return
     value is yielded. The first truthy value sent back is handed to
     ``defer.returnValue``; otherwise ``_failure`` itself is. ``self``,
     ``request`` and ``spider`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook yields something other than
     ``None``, a ``Response`` or a ``Request``.
     """
     error = _failure.value
     for mw_method in self.methods['process_exception']:
         outcome = yield mw_method(request=request, exception=error, spider=spider)
         if outcome is None:
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 'Middleware %s.process_exception must return None, Response or Request, got %s'
                 % (six.get_method_self(mw_method).__class__.__name__,
                    type(outcome)))
         if outcome:
             defer.returnValue(outcome)
     defer.returnValue(_failure)
Esempio n. 10
0
 def process_request(request):
     """Run the ``process_request`` hooks, falling back to the download.

     Generator: each hook's return value is yielded and the value sent back
     is inspected. The first truthy value is handed to ``defer.returnValue``;
     if no hook produces one, the value sent back for
     ``download_func(request=request, spider=spider)`` is used instead.
     ``self``, ``spider`` and ``download_func`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook yields something other than
     ``None``, a ``Response`` or a ``Request``.
     """
     for mw_method in self.methods['process_request']:
         outcome = yield mw_method(request=request, spider=spider)
         if outcome is None:
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 'Middleware %s.process_request must return None, Response or Request, got %s'
                 % (mw_method.__self__.__class__.__name__,
                    outcome.__class__.__name__))
         if outcome:
             defer.returnValue(outcome)
     downloaded = yield download_func(request=request, spider=spider)
     defer.returnValue(downloaded)
Esempio n. 11
0
 def process_request(request):
     """Run the ``process_request`` hooks, falling back to the download.

     Generator: each hook's return value is wrapped with
     ``deferred_from_coro`` and yielded; the value sent back is inspected.
     The first truthy value is returned; if no hook produces one, the value
     sent back for ``download_func(request=request, spider=spider)`` is
     returned. ``self``, ``spider`` and ``download_func`` come from the
     enclosing scope.

     Raises ``_InvalidOutput`` when a hook yields something other than
     ``None``, a ``Response`` or a ``Request``.
     """
     for mw_method in self.methods['process_request']:
         pending = deferred_from_coro(mw_method(request=request, spider=spider))
         outcome = yield pending
         if outcome is None:
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 "Middleware %s.process_request must return None, Response or Request, got %s"
                 % (mw_method.__self__.__class__.__name__,
                    outcome.__class__.__name__))
         if outcome:
             return outcome
     downloaded = yield download_func(request=request, spider=spider)
     return downloaded
Esempio n. 12
0
 def _process_spider_exception(self, response: Response, spider: Spider,
                               _failure: Failure,
                               start_index: int = 0) -> Union[Failure, MutableChain]:
     """Offer a failure to the ``process_spider_exception`` hooks.

     Hooks are read from ``self.methods['process_spider_exception']``
     starting at ``start_index``. Returns ``_failure`` unchanged when no hook
     produces an iterable; otherwise returns the already-available result of
     the Deferred from ``self._process_spider_output``.

     Raises ``_InvalidOutput`` when a hook returns something that is neither
     ``None`` nor an iterable, or when that Deferred has not fired yet.
     """
     error = _failure.value
     if isinstance(error, _InvalidOutput):
         # _InvalidOutput is never offered to the middlewares
         return _failure
     pending = islice(self.methods['process_spider_exception'],
                      start_index, None)
     for position, raw_handler in enumerate(pending, start=start_index):
         if raw_handler is None:
             continue
         handler = cast(Callable, raw_handler)
         outcome = handler(response=response, exception=error, spider=spider)
         if not _isiterable(outcome):
             if outcome is None:
                 continue
             msg = (f"{handler.__qualname__} must return None "
                    f"or an iterable, got {type(outcome)}")
             raise _InvalidOutput(msg)
         # an iterable ends exception handling: control goes back to the
         # process_spider_output chain from the next index on
         output_dfd: Deferred = self._process_spider_output(
             response, spider, outcome, position + 1)
         # _process_spider_output() returns a Deferred only because of
         # downgrading, so this can be simplified once downgrading is removed
         if not output_dfd.called:
             # waiting is forbidden here: it would force this method to
             # return a Deferred too, which complicates the architecture
             msg = f"Async iterable returned from {handler.__qualname__} cannot be downgraded"
             raise _InvalidOutput(msg)
         # the result is available immediately when no downgrading happened
         return output_dfd.result
     return _failure
Esempio n. 13
0
 def process_spider_input(response):
     """Run the ``process_spider_input`` hooks, then call ``scrape_func``.

     Every hook must return ``None``. If a hook raises any other exception,
     ``scrape_func`` is called with a ``Failure()`` built inside the handler
     instead of the response. ``self``, ``spider``, ``request`` and
     ``scrape_func`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook returns a non-``None`` value.
     """
     for mw_method in self.methods['process_spider_input']:
         try:
             outcome = mw_method(response=response, spider=spider)
             if outcome is None:
                 continue
             template = 'Middleware {} must return None or raise an exception, got {}'
             raise _InvalidOutput(
                 template.format(fname(mw_method), type(outcome)))
         except _InvalidOutput:
             # contract violations propagate untouched
             raise
         except Exception:
             return scrape_func(Failure(), request, spider)
     return scrape_func(response, request, spider)
Esempio n. 14
0
 def process_spider_input(response):
     """Run the ``process_spider_input`` hooks, then call ``scrape_func``.

     Every hook must return ``None``. If a hook raises any other exception,
     ``scrape_func`` is called with a ``Failure()`` built inside the handler
     instead of the response. ``self``, ``spider``, ``request`` and
     ``scrape_func`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook returns a non-``None`` value.
     """
     for mw_method in self.methods['process_spider_input']:
         try:
             outcome = mw_method(response=response, spider=spider)
             if outcome is None:
                 continue
             template = 'Middleware {} must return None or raise an exception, got {}'
             raise _InvalidOutput(
                 template.format(fname(mw_method), type(outcome)))
         except _InvalidOutput:
             # contract violations propagate untouched
             raise
         except Exception:
             return scrape_func(Failure(), request, spider)
     return scrape_func(response, request, spider)
Esempio n. 15
0
 def process_exception(failure):
     """Run the ``process_exception`` hooks for a failed download.

     Generator: each hook is called with ``failure.value``; its return value
     is wrapped with ``deferred_from_coro`` and yielded. The first truthy
     value sent back is returned; otherwise ``failure`` itself is. ``self``,
     ``request`` and ``spider`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook yields something other than
     ``None``, a ``Response`` or a ``Request``.
     """
     error = failure.value
     for mw_method in self.methods['process_exception']:
         pending = deferred_from_coro(
             mw_method(request=request, exception=error, spider=spider))
         outcome = yield pending
         if outcome is None:
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 "Middleware %s.process_exception must return None, Response or Request, got %s"
                 % (mw_method.__self__.__class__.__name__, type(outcome)))
         if outcome:
             return outcome
     return failure
def check_req_rules(reqclass, requests, request_url):
    """Apply every rule defined on ``reqclass`` to every request.

    A rule is any callable attribute of ``reqclass`` whose name does not
    start with a double underscore. Rules signal a violation by raising
    ``AssertionError``.

    Args:
        reqclass: object whose callable attributes are the rules.
        requests: iterable of requests to check.
        request_url: URL of the originating request, used in the message.

    Raises:
        _InvalidOutput: when a rule raises ``AssertionError`` for a request.
    """
    rules = []
    for attr_name in dir(reqclass):
        if attr_name.startswith('__'):
            continue
        attr = getattr(reqclass, attr_name)
        if callable(attr):
            rules.append((attr_name, attr))

    for req in requests:
        for rule_name, rule in rules:
            try:
                rule(req)
            except AssertionError:
                raise _InvalidOutput(
                    "A request produced by the request with url %s has "
                    "failed the rule %s" % (request_url, rule_name))
Esempio n. 17
0
 def process_exception(failure):
     """Run the ``process_exception`` hooks for a failed download.

     Generator: each hook is called with ``failure.value``; its return value
     is wrapped with ``deferred_from_coro`` and yielded. The first truthy
     value sent back is returned; otherwise ``failure`` itself is. ``self``,
     ``request`` and ``spider`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook yields something other than
     ``None``, a ``Response`` or a ``Request``.
     """
     error = failure.value
     for mw_method in self.methods['process_exception']:
         pending = deferred_from_coro(
             mw_method(request=request, exception=error, spider=spider))
         outcome = yield pending
         if outcome is None:
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 f"Middleware {mw_method.__qualname__} must return None, Response or "
                 f"Request, got {type(outcome)}"
             )
         if outcome:
             return outcome
     return failure
Esempio n. 18
0
 def process_request(request: Request):
     """Run the ``process_request`` hooks, falling back to the download.

     Generator: each hook's return value is wrapped with
     ``deferred_from_coro`` and yielded; the value sent back is inspected.
     The first truthy value is returned; if no hook produces one, the value
     sent back for ``download_func(request=request, spider=spider)`` is
     returned. ``self``, ``spider`` and ``download_func`` come from the
     enclosing scope.

     Raises ``_InvalidOutput`` when a hook yields something other than
     ``None``, a ``Response`` or a ``Request``.
     """
     for raw_method in self.methods['process_request']:
         mw_method = cast(Callable, raw_method)
         pending = deferred_from_coro(mw_method(request=request, spider=spider))
         outcome = yield pending
         if outcome is None:
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 f"Middleware {mw_method.__qualname__} must return None, Response or "
                 f"Request, got {outcome.__class__.__name__}")
         if outcome:
             return outcome
     downloaded = yield download_func(request=request, spider=spider)
     return downloaded
Esempio n. 19
0
        def process_response(response):
            """Run the ``process_response`` hooks over a download result.

            Generator: a ``Request`` passed in is handed straight to
            ``defer.returnValue``. Otherwise each hook's return value is
            yielded and the value sent back replaces the current response;
            a ``Request`` ends the chain early. ``self``, ``request`` and
            ``spider`` come from the enclosing scope.

            Raises ``_InvalidOutput`` when a hook yields something that is
            neither a ``Response`` nor a ``Request``.
            """
            assert response is not None, 'Received None in process_response'
            current = response
            if isinstance(current, Request):
                defer.returnValue(current)

            for mw_method in self.methods['process_response']:
                current = yield mw_method(
                    request=request, response=current, spider=spider)
                if not isinstance(current, (Response, Request)):
                    raise _InvalidOutput(
                        'Middleware %s.process_response must return Response or Request, got %s'
                        % (six.get_method_self(mw_method).__class__.__name__,
                           type(current)))
                if isinstance(current, Request):
                    defer.returnValue(current)
            defer.returnValue(current)
def check_item_rules(itemclass, items, request_url):
    """Apply every rule defined on ``itemclass`` to every item.

    A rule is any callable attribute of ``itemclass`` whose name does not
    start with a double underscore. Rules signal a violation by raising
    ``AssertionError``.

    Args:
        itemclass: object whose callable attributes are the rules.
        items: iterable of items to check.
        request_url: URL of the originating request, used in the message.

    Raises:
        _InvalidOutput: when a rule raises ``AssertionError`` for an item.
    """
    rules = []
    for attr_name in dir(itemclass):
        if attr_name.startswith('__'):
            continue
        attr = getattr(itemclass, attr_name)
        if callable(attr):
            rules.append((attr_name, attr))

    for item in items:
        for rule_name, rule in rules:
            try:
                rule(item)
            except AssertionError:
                raise _InvalidOutput(
                    "An item produced by the request with url %s has "
                    "failed the rule %s" % (request_url, rule_name))
Esempio n. 21
0
 def _process_spider_input(self, scrape_func, response, request, spider):
     """Run the ``process_spider_input`` hooks, then call ``scrape_func``.

     Every hook must return ``None``. If a hook raises any other exception,
     ``scrape_func`` is called with a ``Failure()`` built inside the handler
     instead of the response.

     Raises ``_InvalidOutput`` when a hook returns a non-``None`` value.
     """
     for mw_method in self.methods["process_spider_input"]:
         try:
             outcome = mw_method(response=response, spider=spider)
             if outcome is None:
                 continue
             msg = (f"Middleware {_fname(mw_method)} must return None "
                    f"or raise an exception, got {type(outcome)}")
             raise _InvalidOutput(msg)
         except _InvalidOutput:
             # contract violations propagate untouched
             raise
         except Exception:
             return scrape_func(Failure(), request, spider)
     return scrape_func(response, request, spider)
Esempio n. 22
0
 def process_exception(_failure):
     """Run the ``process_exception`` hooks for a failed download.

     Generator: each hook is called with ``_failure.value`` and its return
     value is yielded. The first truthy value sent back is handed to
     ``defer.returnValue``; otherwise ``_failure`` itself is. ``self``,
     ``request`` and ``spider`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook yields something other than
     ``None``, a ``Response`` or a ``Request``.
     """
     error = _failure.value
     for mw_method in self.methods['process_exception']:
         outcome = yield mw_method(request=request, exception=error, spider=spider)
         if outcome is None:
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 'Middleware %s.process_exception must return None, Response or Request, got %s'
                 % (mw_method.__self__.__class__.__name__, type(outcome)))
         if outcome:
             defer.returnValue(outcome)
     defer.returnValue(_failure)
Esempio n. 23
0
        def process_response(response):
            """Run the ``process_response`` hooks over a download result.

            Generator: a ``Request`` passed in is handed straight to
            ``defer.returnValue``. Otherwise each hook's return value is
            wrapped with ``deferred_from_coro`` and yielded, and the value
            sent back replaces the current response; a ``Request`` ends the
            chain early. ``self``, ``request`` and ``spider`` come from the
            enclosing scope.

            Raises ``_InvalidOutput`` when a hook yields something that is
            neither a ``Response`` nor a ``Request``.
            """
            assert response is not None, 'Received None in process_response'
            current = response
            if isinstance(current, Request):
                defer.returnValue(current)

            for mw_method in self.methods['process_response']:
                pending = deferred_from_coro(
                    mw_method(request=request, response=current, spider=spider))
                current = yield pending
                if not isinstance(current, (Response, Request)):
                    raise _InvalidOutput(
                        'Middleware %s.process_response must return Response or Request, got %s'
                        % (mw_method.__self__.__class__.__name__, type(current)))
                if isinstance(current, Request):
                    defer.returnValue(current)
            defer.returnValue(current)
Esempio n. 24
0
 def process_spider_input(response):
     """Run the ``process_spider_input`` hooks, then call ``scrape_func``.

     Every hook must return ``None``. If a hook raises any other exception,
     ``scrape_func`` is called with a ``Failure()`` built inside the handler
     instead of the response. ``self``, ``spider``, ``request`` and
     ``scrape_func`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook returns a non-``None`` value.
     """
     # call every middleware hook to process the response
     for mw_method in self.methods['process_spider_input']:
         try:
             outcome = mw_method(response=response, spider=spider)
             if outcome is None:
                 # a middleware's process_spider_input must return None
                 continue
             msg = (f"Middleware {_fname(mw_method)} must return None "
                    f"or raise an exception, got {type(outcome)}")
             raise _InvalidOutput(msg)
         except _InvalidOutput:
             # contract violations propagate untouched
             raise
         except Exception:
             return scrape_func(Failure(), request, spider)
     return scrape_func(response, request, spider)
Esempio n. 25
0
 def process_request(request):
     """Run the ``process_request`` hooks, falling back to the download.

     Generator: each hook's return value is wrapped with
     ``deferred_from_coro`` and yielded; the value sent back is inspected.
     The first truthy value is returned; if no hook produces one, the value
     sent back for ``download_func(request=request, spider=spider)`` is
     returned. ``self``, ``spider`` and ``download_func`` come from the
     enclosing scope.

     Raises ``_InvalidOutput`` when a hook yields something other than
     ``None``, a ``Response`` or a ``Request``.
     """
     for mw_method in self.methods['process_request']:
         # per the original note: deferred_from_coro converts the middleware
         # method's result from asyncio form to a reactor-side one, which is
         # then yielded
         pending = deferred_from_coro(mw_method(request=request, spider=spider))
         outcome = yield pending
         if outcome is None:
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 f"Middleware {mw_method.__self__.__class__.__name__}"
                 ".process_request must return None, Response or "
                 f"Request, got {outcome.__class__.__name__}")
         if outcome:
             return outcome
     downloaded = yield download_func(request=request, spider=spider)
     return downloaded
Esempio n. 26
0
 async def process_request(self, spider, request):
     """Run the ``process_request`` hooks and return the first truthy result.

     Hooks are read from ``self.methods['process_request']``. Coroutine
     functions are awaited; plain callables are called directly. Returns
     ``None`` implicitly when no hook produces a truthy value.

     Raises ``_InvalidOutput`` when a hook returns something other than
     ``None``, a ``Response`` or a ``Request``.
     """
     for mw_method in self.methods['process_request']:
         outcome = (
             await mw_method(request=request, spider=spider)
             if iscoroutinefunction(mw_method)
             else mw_method(request=request, spider=spider)
         )
         if outcome is None:
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 "Middleware %s.process_request must return None, Response or Request, got %s"
                 % (mw_method.__self__.__class__.__name__,
                    outcome.__class__.__name__))
         if outcome:
             return outcome
Esempio n. 27
0
 def process_request(request):
     """Run the ``process_request`` hooks, falling back to the download.

     Generator: each hook's return value is yielded and the value sent back
     is inspected. The first truthy value is handed to ``defer.returnValue``;
     if no hook produces one, the value sent back for
     ``download_func(request=request, spider=spider)`` is used instead.
     ``self``, ``spider`` and ``download_func`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook yields something other than
     ``None``, a ``Response`` or a ``Request``.
     """
     # run each downloader middleware's process_request method in turn
     for mw_method in self.methods['process_request']:
         # this is where request and spider are passed to process_request
         outcome = yield mw_method(request=request, spider=spider)
         if outcome is None:
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 'Middleware %s.process_request must return None, Response or Request, got %s'
                 % (six.get_method_self(mw_method).__class__.__name__,
                    outcome.__class__.__name__))
         if outcome:
             defer.returnValue(outcome)
     # no middleware returned a value, so call the registered function
     # (per the original note, the Downloader's _enqueue_request)
     downloaded = yield download_func(request=request, spider=spider)
     defer.returnValue(downloaded)
Esempio n. 28
0
 def process_spider_input(response):
     """Run the ``process_spider_input`` hooks, then call ``scrape_func``.

     Per the original note, this runs when the engine hands the response to
     the spider and it passes through the spider middlewares.

     Every hook must return ``None``. If a hook raises any other exception,
     ``scrape_func`` is called with a ``Failure()`` built inside the handler
     instead of the response. ``self``, ``spider``, ``request`` and
     ``scrape_func`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook returns a non-``None`` value.
     """
     # run the spider middlewares' process_spider_input hooks in order
     for mw_method in self.methods['process_spider_input']:
         try:
             outcome = mw_method(response=response, spider=spider)
             if outcome is None:
                 continue
             template = 'Middleware {} must return None or raise an exception, got {}'
             raise _InvalidOutput(
                 template.format(fname(mw_method), type(outcome)))
         except _InvalidOutput:
             # contract violations propagate untouched
             raise
         except Exception:
             return scrape_func(Failure(), request, spider)
     # per the original note: after the middlewares ran, this schedules the
     # work that eventually calls the spider's parse method
     return scrape_func(response, request, spider)
Esempio n. 29
0
 def process_request(request):
     """Run the ``process_request`` hooks, falling back to the download.

     Generator: each hook's return value is yielded and the value sent back
     is inspected. The first truthy value is handed to ``defer.returnValue``;
     if no hook produces one, the value sent back for
     ``download_func(request=request, spider=spider)`` is used instead.
     ``self``, ``spider`` and ``download_func`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook yields something other than
     ``None``, a ``Response`` or a ``Request``.
     """
     # run the downloader middlewares' process_request methods in order,
     # so each can process or validate the request before the download
     for mw_method in self.methods['process_request']:
         outcome = yield mw_method(request=request, spider=spider)
         if outcome is None:
             continue
         if not isinstance(outcome, (Response, Request)):
             raise _InvalidOutput(
                 'Middleware %s.process_request must return None, Response or Request, got %s'
                 % (six.get_method_self(mw_method).__class__.__name__,
                    outcome.__class__.__name__))
         if outcome:
             defer.returnValue(outcome)
     # the actual download happens here (per the original note,
     # download_func is the _enqueue_request method)
     downloaded = yield download_func(request=request, spider=spider)
     defer.returnValue(downloaded)
Esempio n. 30
0
        def process_response(response: Union[Response, Request]):
            """Run the ``process_response`` hooks over a download result.

            Generator: a ``Request`` passed in is returned as is. Otherwise
            each hook's return value is wrapped with ``deferred_from_coro``
            and yielded, and the value sent back replaces the current
            response; a ``Request`` ends the chain early. ``self``,
            ``request`` and ``spider`` come from the enclosing scope.

            Raises ``TypeError`` when ``response`` is ``None`` and
            ``_InvalidOutput`` when a hook yields something that is neither
            a ``Response`` nor a ``Request``.
            """
            if response is None:
                raise TypeError("Received None in process_response")
            current = response
            if isinstance(current, Request):
                return current

            for mw_method in self.methods['process_response']:
                pending = deferred_from_coro(
                    mw_method(request=request, response=current, spider=spider))
                current = yield pending
                if not isinstance(current, (Response, Request)):
                    raise _InvalidOutput(
                        f"Middleware {mw_method.__qualname__} must return Response or Request, "
                        f"got {type(current)}")
                if isinstance(current, Request):
                    return current
            return current
Esempio n. 31
0
        def process_response(response):
            """Run the ``process_response`` hooks over a download result.

            Generator: a ``Request`` passed in is returned as is. Otherwise
            each hook's return value is wrapped with ``deferred_from_coro``
            and yielded, and the value sent back replaces the current
            response; a ``Request`` ends the chain early. ``self``,
            ``request`` and ``spider`` come from the enclosing scope.

            Raises ``TypeError`` when ``response`` is ``None`` and
            ``_InvalidOutput`` when a hook yields something that is neither
            a ``Response`` nor a ``Request``.
            """
            if response is None:
                raise TypeError("Received None in process_response")
            current = response
            if isinstance(current, Request):
                return current

            for mw_method in self.methods['process_response']:
                pending = deferred_from_coro(
                    mw_method(request=request, response=current, spider=spider))
                current = yield pending
                if not isinstance(current, (Response, Request)):
                    raise _InvalidOutput(
                        "Middleware %s.process_response must return Response or Request, got %s"
                        % (mw_method.__self__.__class__.__name__, type(current)))
                if isinstance(current, Request):
                    return current
            return current
Esempio n. 32
0
 def process_spider_input(response):
     """Run the ``process_spider_input`` hooks, then call ``scrape_func``.

     An info line with ``len(response.body)`` and ``request.url`` is logged
     before each hook, and another with ``request.url`` before the final
     ``scrape_func`` call.

     Every hook must return ``None``. If a hook raises any other exception,
     ``scrape_func`` is called with a ``Failure()`` built inside the handler
     instead of the response. ``self``, ``spider``, ``request``,
     ``scrape_func`` and ``logger`` come from an outer scope.

     Raises ``_InvalidOutput`` when a hook returns a non-``None`` value.
     """
     for mw_method in self.methods['process_spider_input']:
         body_size = len(response.body)
         logger.info('[spider_input] Processing %s pages in url: %s',
                     body_size, request.url)
         try:
             outcome = mw_method(response=response, spider=spider)
             if outcome is None:
                 continue
             template = 'Middleware {} must return None or raise an exception, got {}'
             raise _InvalidOutput(
                 template.format(fname(mw_method), type(outcome)))
         except _InvalidOutput:
             # contract violations propagate untouched
             raise
         except Exception:
             return scrape_func(Failure(), request, spider)
     logger.info('[spider_input] Processing input for url: %s', request.url)
     return scrape_func(response, request, spider)
Esempio n. 33
0
 def _process_spider_input(self, scrape_func: ScrapeFunc,
                           response: Response, request: Request,
                           spider: Spider) -> Any:
     """Run the ``process_spider_input`` hooks, then call ``scrape_func``.

     Every hook must return ``None``. If a hook raises any other exception,
     ``scrape_func`` is called with a ``Failure()`` built inside the handler
     instead of the response.

     Raises ``_InvalidOutput`` when a hook returns a non-``None`` value.
     """
     for mw_method in self.methods['process_spider_input']:
         try:
             outcome = mw_method(response=response, spider=spider)
             if outcome is None:
                 continue
             msg = (
                 f"Middleware {mw_method.__qualname__} must return None "
                 f"or raise an exception, got {type(outcome)}")
             raise _InvalidOutput(msg)
         except _InvalidOutput:
             # contract violations propagate untouched
             raise
         except Exception:
             return scrape_func(Failure(), request, spider)
     return scrape_func(response, request, spider)
Esempio n. 34
0
 def process_spider_input(response):
     """Run the ``process_spider_input`` hooks, then call ``scrape_func``.

     Every hook must return ``None``. If a hook raises any other exception,
     ``scrape_func`` is called with that exception instead of the response;
     when it hands the very same exception object back, the exception is
     raised, otherwise its return value is returned. ``self``, ``spider``,
     ``request`` and ``scrape_func`` come from the enclosing scope.

     Raises ``_InvalidOutput`` when a hook returns a non-``None`` value.
     """
     for mw_method in self.methods['process_spider_input']:
         try:
             outcome = mw_method(response=response, spider=spider)
             if outcome is None:
                 continue
             msg = "Middleware {} must return None or raise an exception, got {}"
             raise _InvalidOutput(
                 msg.format(_fname(mw_method), type(outcome)))
         except _InvalidOutput:
             # contract violations propagate untouched
             raise
         except Exception as error:
             handled = scrape_func(error, request, spider)
             if handled is error:
                 # scrape_func gave the same exception back: not recovered
                 raise handled
             return handled
     return scrape_func(response, request, spider)
Esempio n. 35
0
 def process_response(response):
     """Run the ``process_response`` hooks over a download result.

     Generator: a ``Request`` passed in is handed straight to
     ``defer.returnValue``. Otherwise each hook's return value is yielded
     and the value sent back replaces the current response; a ``Request``
     ends the chain early. ``self``, ``request`` and ``spider`` come from
     the enclosing scope.

     Raises ``_InvalidOutput`` when a hook yields something that is neither
     a ``Response`` nor a ``Request``.
     """
     assert response is not None, 'Received None in process_response'
     current = response
     if isinstance(current, Request):
         defer.returnValue(current)
     # run the downloader middlewares' process_response hooks in order
     for mw_method in self.methods['process_response']:
         current = yield mw_method(
             request=request, response=current, spider=spider)
         if not isinstance(current, (Response, Request)):
             raise _InvalidOutput(
                 'Middleware %s.process_response must return Response or Request, got %s'
                 % (six.get_method_self(mw_method).__class__.__name__,
                    type(current)))
         if isinstance(current, Request):
             defer.returnValue(current)
     defer.returnValue(current)
Esempio n. 36
0
 def process_spider_exception(_failure, start_index=0):
     """Offer a failure to the ``process_spider_exception`` hooks.

     Hooks are read from ``self.methods['process_spider_exception']``
     starting at ``start_index``; ``self``, ``response`` and ``spider`` come
     from the enclosing scope. Returns ``_failure`` unchanged when no hook
     produces an iterable, otherwise the value of ``process_spider_output``
     resumed at the following index. Raises ``_InvalidOutput`` when a hook
     returns something that is neither ``None`` nor an iterable.
     """
     error = _failure.value
     if isinstance(error, _InvalidOutput):
         # _InvalidOutput is never offered to the middlewares
         return _failure
     handlers = islice(
         self.methods['process_spider_exception'], start_index, None)
     for position, handler in enumerate(handlers, start=start_index):
         if handler is None:
             continue
         outcome = handler(response=response, exception=error, spider=spider)
         if _isiterable(outcome):
             # an iterable ends exception handling: control goes back to
             # the process_spider_output chain from the next index on
             return process_spider_output(outcome, position + 1)
         if outcome is not None:
             template = 'Middleware {} must return None or an iterable, got {}'
             raise _InvalidOutput(
                 template.format(fname(handler), type(outcome)))
     return _failure