Example #1
 def __init__(self, spider, middlewire=None, pipline: Piplines = None):
     self.reminder = reminder  # 'reminder' (like 'log' below) is a module-level singleton, not a parameter
     self.log = log
     self.lock = None
     self.task_dict: Dict[str, asyncio.Task] = {}
     self.pip_task_dict: Dict[str, asyncio.Task] = {}
     self.spider = spider
     self.middlewire = middlewire
     self.piplines = pipline
     duplicate_filter_class = self._get_dynamic_class_setting("duplicate_filter_class")
     scheduler_container_class = self._get_dynamic_class_setting("scheduler_container_class")
     net_download_class = self._get_dynamic_class_setting("net_download_class")
     self.scheduler = Scheduler(duplicate_filter_class(), scheduler_container_class())
     req_per_concurrent = self.spider.cutome_setting_dict.get("req_per_concurrent") or gloable_setting_dict.get(
         "req_per_concurrent")
     single = self.spider.cutome_setting_dict.get("is_single")
     self.is_single = gloable_setting_dict.get("is_single") if single is None else single
     self.downloader = Downloader(self.scheduler, self.middlewire, reminder=self.reminder,
                                  seq=req_per_concurrent,
                                  downer=net_download_class())
     self.request_generator_queue = deque()
     self.stop = False
     self.condition = asyncio.Condition()
     self.item_queue = asyncio.Queue()
     pipline_is_paralleled = self.spider.cutome_setting_dict.get("pipline_is_paralleled")
     if pipline_is_paralleled is None:
         pipline_is_paralleled = gloable_setting_dict.get("pipline_is_paralleled")
     self.pipline_is_paralleled = pipline_is_paralleled
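
Every tunable above is resolved through the same two-level lookup: the spider's cutome_setting_dict first, then the framework-wide gloable_setting_dict. A minimal, self-contained sketch of that fallback pattern (the dictionaries and values below are illustrative stand-ins, not the library's real defaults):

    # Illustrative stand-ins for cutome_setting_dict / gloable_setting_dict;
    # the real defaults live in the framework's settings module.
    gloable_setting_dict = {"req_per_concurrent": 50, "is_single": True}
    cutome_setting_dict = {"req_per_concurrent": 10}  # spider-level override

    def resolve(key):
        value = cutome_setting_dict.get(key)
        return gloable_setting_dict.get(key) if value is None else value

    print(resolve("req_per_concurrent"))  # -> 10, the spider override wins
    print(resolve("is_single"))           # -> True, falls back to the global
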
Example #2
 def __init__(self, spider, middlewire=None, pipline: Piplines = None):
     self.lock = None
     self.task_dict: Dict[str, asyncio.Task] = {}
     self.pip_task_dict: Dict[str, asyncio.Task] = {}
     self.spider = spider
     self.middlewire = middlewire
     self.piplines = pipline
     self.reminder = reminder
     duplicate_filter_class = self._get_dynamic_class_setting(
         "duplicate_filter_class")
     scheduler_container_class = self._get_dynamic_class_setting(
         "scheduler_container_class")
     net_download_class = self._get_dynamic_class_setting(
         "net_download_class")
     self.scheduler = Scheduler(duplicate_filter_class(),
                                scheduler_container_class())
     req_per_concurrent = self.spider.cutome_setting_dict.get(
         "req_per_concurrent") or gloable_setting_dict.get(
             "req_per_concurrent")
     self.downloader = Downloader(self.scheduler,
                                  self.middlewire,
                                  seq=req_per_concurrent,
                                  reminder=self.reminder,
                                  downer=net_download_class())
     self.request_generator_queue = deque()
     self.stop = False
     self.log = log
Example #3
 def _get_dynamic_class_setting(self, key):
     # spider-level settings take precedence over the global defaults
     class_str = self.spider.cutome_setting_dict.get(key) or gloable_setting_dict.get(key)
     # resolve a dotted path like "package.module.ClassName" to the class object
     _module = importlib.import_module(".".join(class_str.split(".")[:-1]))
     _class = getattr(_module, class_str.split(".")[-1])
     return _class
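
For reference, the dotted-path resolution above can be exercised against any importable class; collections.OrderedDict below is only a stand-in for a configured duplicate filter or scheduler container class:

    import importlib

    def load_class(class_str):
        # split "package.module.ClassName" into a module path and a class name
        module_path, _, class_name = class_str.rpartition(".")
        module = importlib.import_module(module_path)
        return getattr(module, class_name)

    cls = load_class("collections.OrderedDict")
    print(cls)    # <class 'collections.OrderedDict'>
    print(cls())  # instantiated, just like duplicate_filter_class() in Example #1
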
Example #4
 def _get_dynamic_class_setting(self, key):
     class_str = self.spider.cutome_setting_dict.get(
         key) or gloable_setting_dict.get(key)
     _module = importlib.import_module(".".join(class_str.split(".")[:-1]))
     _class = getattr(_module, class_str.split(".")[-1])
     self.log.info(
         f"dynamically loaded key 【{key}】 --> class 【{class_str}】 successfully")
     return _class
Example #5
 def __init__(self, loop=None):
     if sys.platform == "win32":
         # the Proactor event loop mitigates "too many open files" errors on Windows
         loop = loop or asyncio.ProactorEventLoop()
     else:
         loop = loop or asyncio.new_event_loop()
     thread_pool_max_size = gloable_setting_dict.get(
         "thread_pool_max_size", 30)
     loop.set_default_executor(ThreadPoolExecutor(thread_pool_max_size))
     asyncio.set_event_loop(loop)
     self.loop = loop
     self.cores = []
     self.log = log
     self.spider_names = []
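
Setting the default executor means any later run_in_executor(None, ...) call, which is how the downloader runs synchronous fetchers, is served by this bounded thread pool. A minimal sketch of the same wiring outside the framework:

    import asyncio
    import time
    from concurrent.futures import ThreadPoolExecutor

    def blocking_io():
        time.sleep(0.1)  # stand-in for a blocking download
        return "done"

    async def main():
        loop = asyncio.get_running_loop()
        # None selects the default executor configured below
        return await loop.run_in_executor(None, blocking_io)

    loop = asyncio.new_event_loop()
    loop.set_default_executor(ThreadPoolExecutor(max_workers=30))
    asyncio.set_event_loop(loop)
    print(loop.run_until_complete(main()))  # -> done
    loop.close()
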
Example #6
 def _check_internet_state(self):
     self.log.info("check internet health")
     error_msg = "internet may not be available please check net, run ended"
     net_healthy_check_url = gloable_setting_dict.get(
         "net_healthy_check_url", None)
     if net_healthy_check_url is None:
         return
     if not is_valid_url(net_healthy_check_url):
         return
     try:
         resp = urlopen(url=net_healthy_check_url, timeout=10)
         if not 200 <= resp.status <= 299:
             raise RuntimeError(error_msg)
     except Exception as e:
         raise RuntimeError(error_msg) from e
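
A hedged usage sketch of the same probe in isolation; the URL below is an example value, not the framework's default net_healthy_check_url:

    from urllib.request import urlopen

    def check_internet(url="https://www.example.com", timeout=10):
        # mirrors _check_internet_state: a non-2xx status or any network
        # error is treated as "internet not available"
        try:
            resp = urlopen(url=url, timeout=timeout)
            return 200 <= resp.status <= 299
        except Exception:
            return False

    if not check_internet():
        raise RuntimeError("internet may not be available, please check the network")
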
Example #7
    "selenium",
    # markdown
    "MARKDOWN",
    "build_extension",
    # newspaper
    "calculate_area",
    "largest_image_url",
    "newspaper.images",
    "newspaper",
    "Importing",
    "PIL",
]

# silence noisy third-party loggers
log_level = getattr(logging, gloable_setting_dict.get("log_level").upper())
for STOP_LOG in STOP_LOGS:
    logging.getLogger(STOP_LOG).setLevel(log_level)

# print(logging.Logger.manager.loggerDict)  # inspect registered logger names when debugging

# log level ordering: critical > error > warning > info > debug
_log = get_logger(
    name=gloable_setting_dict.get("log_name"),
    path=gloable_setting_dict.get("log_path"),
    log_level=gloable_setting_dict.get("log_level").upper(),
    is_write_to_file=gloable_setting_dict.get("is_write_to_file"),
)

log = _log
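
The silencing idiom above works for any chatty third-party logger; this standalone sketch uses illustrative logger names and a hard-coded level rather than the settings dictionary:

    import logging

    logging.basicConfig(level=logging.DEBUG)  # show everything that is not silenced
    NOISY_LOGGERS = ["urllib3", "PIL"]        # illustrative third-party logger names
    level = getattr(logging, "WARNING")       # same lookup the loop above performs

    for name in NOISY_LOGGERS:
        logging.getLogger(name).setLevel(level)

    logging.getLogger("urllib3").debug("suppressed")     # below WARNING: dropped
    logging.getLogger("urllib3").warning("still shown")  # WARNING and above pass
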
Example #8
    async def download(self, request: Request):
        spider = request.__spider__
        max_retry = spider.cutome_setting_dict.get("req_max_retry") or gloable_setting_dict.get(
            "req_max_retry")
        if max_retry <= 0:
            raise ValueError("req_max_retry must be > 0")
        header_dict = spider.cutome_setting_dict.get("default_headers") or gloable_setting_dict.get(
            "default_headers")
        req_timeout = request.timeout or spider.cutome_setting_dict.get("req_timeout") or gloable_setting_dict.get(
            "req_timeout")
        request.timeout = req_timeout
        header = request.header or {}
        header.update(header_dict)  # dict.update mutates in place and returns None
        request.header = header
        ignore_response_codes = spider.cutome_setting_dict.get("ignore_response_codes") or gloable_setting_dict.get(
            "ignore_response_codes")
        req_delay = spider.cutome_setting_dict.get("req_delay") or gloable_setting_dict.get("req_delay")
        if request and request.retry >= max_retry:
            # reached max retry times
            self.reminder.go(Reminder.request_dropped, request, scheduler=self.scheduler)
            self.log.error(f'reached max retry times... {request}')
            return
        request.retry = request.retry + 1
        # when canceled
        loop = asyncio.get_running_loop()
        if loop.is_closed() or not loop.is_running():
            self.log.warning('loop is closed or not running in download')
            return
        response = None  # stays None if the task is cancelled before a fetch completes
        with suppress(asyncio.CancelledError):
            async with self.semaphore:
                await self._before_fetch(request)

                fetch = self.downer.fetch
                iscoroutinefunction = inspect.iscoroutinefunction(fetch)
                # support sync or async request
                try:
                    # req_delay
                    if req_delay > 0:
                        await asyncio.sleep(req_delay)
                    self.log.info(f"send a request: url: {request.url}")
                    if iscoroutinefunction:
                        response = await fetch(request)
                    else:
                        self.log.debug('fetch looks like a sync function, so it will run in an executor')
                        response = await asyncio.get_event_loop() \
                            .run_in_executor(None, fetch, request)
                except TimeoutError as e:
                    # delay retry
                    wait = self.scheduler.schedlue(request)
                    if inspect.isawaitable(wait):
                        await wait
                    self.log.debug(
                        f'request timed out; re-scheduling it for a delayed retry: {request.url}')
                    return
                except asyncio.CancelledError as e:
                    self.log.debug('task is cancelled..')
                    return
                except BaseException as e:
                    self.log.error(f'an exception occurred in the downloader: {e}')
                    return
                if response is None or not isinstance(response, Response):
                    self.log.error(
                        f'the downer {self.downer.__class__.__name__} fetch function must return a non-null '
                        'response, and it must be a smart.Response instance or a Response subclass.')
                    return
                self.reminder.go(Reminder.response_downloaded, response)
                if response.status not in ignore_response_codes:
                    await self._after_fetch(request, response)

        if response is not None and response.status not in ignore_response_codes:
            response.request = request
            response.__spider__ = spider
            await self.response_queue.put(response)
        return response
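
The sync-or-async dispatch at the core of download reduces to a few lines; both fetchers below are stand-ins for a custom downer.fetch, not the framework's real downloaders:

    import asyncio
    import inspect

    def sync_fetch(request):         # stand-in for e.g. a requests-based downloader
        return f"sync response for {request}"

    async def async_fetch(request):  # stand-in for e.g. an aiohttp-based downloader
        return f"async response for {request}"

    async def run_fetch(fetch, request):
        # the same check the downloader performs before awaiting
        if inspect.iscoroutinefunction(fetch):
            return await fetch(request)
        # sync fetchers are pushed to the default thread-pool executor
        return await asyncio.get_running_loop().run_in_executor(None, fetch, request)

    async def main():
        print(await run_fetch(sync_fetch, "http://example.com"))
        print(await run_fetch(async_fetch, "http://example.com"))

    asyncio.run(main())
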