def __init__(self, spider, middlewire=None, pipline: Piplines = None):
    """Wire up one crawl engine for *spider*.

    Args:
        spider: the spider instance to run; its ``cutome_setting_dict``
            overrides entries in the module-level ``gloable_setting_dict``.
        middlewire: optional download middleware container — passed through
            to the Downloader. (Spelling follows the project's API.)
        pipline: optional ``Piplines`` item-pipeline container.
    """
    # `reminder` / `log` are module-level singletons shared by the framework.
    self.reminder = reminder
    self.log = log
    self.lock = None
    # Live asyncio tasks keyed by name: crawl tasks and pipeline tasks.
    self.task_dict: Dict[str, asyncio.Task] = {}
    self.pip_task_dict: Dict[str, asyncio.Task] = {}
    self.spider = spider
    self.middlewire = middlewire
    self.piplines = pipline
    # Resolve pluggable components from dotted-path settings
    # (spider-level setting wins over the global default).
    duplicate_filter_class = self._get_dynamic_class_setting("duplicate_filter_class")
    scheduler_container_class = self._get_dynamic_class_setting("scheduler_container_class")
    net_download_class = self._get_dynamic_class_setting("net_download_class")
    self.scheduler = Scheduler(duplicate_filter_class(), scheduler_container_class())
    # Per-request concurrency limit; falsy spider value falls back to global.
    req_per_concurrent = self.spider.cutome_setting_dict.get("req_per_concurrent") or gloable_setting_dict.get(
        "req_per_concurrent")
    # `is_single` may legitimately be False, so test against None rather
    # than truthiness when deciding whether to use the global default.
    single = self.spider.cutome_setting_dict.get("is_single")
    self.is_single = gloable_setting_dict.get("is_single") if single is None else single
    # NOTE: self.reminder must be assigned before this point — the
    # Downloader receives it as a constructor argument.
    self.downloader = Downloader(self.scheduler, self.middlewire, reminder=self.reminder,
                                 seq=req_per_concurrent,
                                 downer=net_download_class())
    self.request_generator_queue = deque()
    self.stop = False
    self.condition = asyncio.Condition()
    self.item_queue = asyncio.Queue()
    # Same None-aware fallback pattern as `is_single`: False is a valid
    # explicit setting and must not be replaced by the global default.
    pipline_is_paralleled = self.spider.cutome_setting_dict.get("pipline_is_paralleled")
    pipline_is_paralleled = gloable_setting_dict.get(
        "pipline_is_paralleled") if pipline_is_paralleled is None else pipline_is_paralleled
    self.pipline_is_paralleled = pipline_is_paralleled
def __init__(self, spider, middlewire=None, pipline: Piplines = None):
    """Wire up one crawl engine for *spider* (simpler variant).

    Args:
        spider: the spider instance to run; its ``cutome_setting_dict``
            overrides entries in the module-level ``gloable_setting_dict``.
        middlewire: optional download middleware container — passed through
            to the Downloader. (Spelling follows the project's API.)
        pipline: optional ``Piplines`` item-pipeline container.
    """
    self.lock = None
    # Live asyncio tasks keyed by name: crawl tasks and pipeline tasks.
    self.task_dict: Dict[str, asyncio.Task] = {}
    self.pip_task_dict: Dict[str, asyncio.Task] = {}
    self.spider = spider
    self.middlewire = middlewire
    self.piplines = pipline
    # `reminder` is a module-level singleton shared by the framework.
    self.reminder = reminder
    # Resolve pluggable components from dotted-path settings
    # (spider-level setting wins over the global default).
    duplicate_filter_class = self._get_dynamic_class_setting(
        "duplicate_filter_class")
    scheduler_container_class = self._get_dynamic_class_setting(
        "scheduler_container_class")
    net_download_class = self._get_dynamic_class_setting(
        "net_download_class")
    self.scheduler = Scheduler(duplicate_filter_class(), scheduler_container_class())
    # Per-request concurrency limit; falsy spider value falls back to global.
    req_per_concurrent = self.spider.cutome_setting_dict.get(
        "req_per_concurrent") or gloable_setting_dict.get(
        "req_per_concurrent")
    # NOTE: self.reminder must be assigned before this point — the
    # Downloader receives it as a constructor argument.
    self.downloader = Downloader(self.scheduler, self.middlewire,
                                 seq=req_per_concurrent,
                                 reminder=self.reminder,
                                 downer=net_download_class())
    self.request_generator_queue = deque()
    self.stop = False
    # `log` is the module-level logger singleton.
    self.log = log
def _get_dynamic_class_setting(self, key): class_str = self.spider.cutome_setting_dict.get( key) or gloable_setting_dict.get( key) _module = importlib.import_module(".".join(class_str.split(".")[:-1])) _class = getattr(_module, class_str.split(".")[-1]) return _class
def _get_dynamic_class_setting(self, key): class_str = self.spider.cutome_setting_dict.get( key) or gloable_setting_dict.get(key) _module = importlib.import_module(".".join(class_str.split(".")[:-1])) _class = getattr(_module, class_str.split(".")[-1]) self.log.info( f"dynamic loaded key【{key}】--> class【{class_str}】success") return _class
def __init__(self, loop=None):
    """Create (or adopt) an event loop and register it as the current one.

    Args:
        loop: optional pre-built event loop; when None a platform-appropriate
            loop is created.

    Side effects: installs the loop globally via ``asyncio.set_event_loop``
    and replaces its default executor with a bounded thread pool.
    """
    if sys.platform == "win32":
        # avoid a certain extent: too many files error
        # (ProactorEventLoop uses IOCP on Windows rather than select-based
        # polling, which mitigates file-descriptor exhaustion.)
        loop = loop or asyncio.ProactorEventLoop()
    else:
        loop = loop or asyncio.new_event_loop()
    # Cap the default executor so run_in_executor work is bounded.
    thread_pool_max_size = gloable_setting_dict.get(
        "thread_pool_max_size", 30)
    loop.set_default_executor(ThreadPoolExecutor(thread_pool_max_size))
    asyncio.set_event_loop(loop)
    self.loop = loop
    # Registered engine cores and the names of spiders they run.
    self.cores = []
    self.log = log
    self.spider_names = []
def _check_internet_state(self):
    """Probe the configured health-check URL to verify internet access.

    Silently returns when no ``net_healthy_check_url`` is configured or the
    configured value is not a valid URL.

    Raises:
        RuntimeError: if the probe request fails or returns a non-2xx status.
    """
    self.log.info("check internet health")
    error_msg = "internet may not be available please check net, run ended"
    net_healthy_check_url = gloable_setting_dict.get(
        "net_healthy_check_url", None)
    if net_healthy_check_url is None:
        return
    if not is_valid_url(net_healthy_check_url):
        return
    try:
        # Close the HTTP response deterministically (the original leaked it).
        with urlopen(url=net_healthy_check_url, timeout=10) as resp:
            status = resp.status
    except Exception as e:
        # Chain the original cause so the real network error is visible in
        # the traceback. (The original blanket except also swallowed the
        # status-code RuntimeError raised inside its own try block and
        # re-raised a fresh one; the status check now lives outside the try.)
        raise RuntimeError(error_msg) from e
    if not 200 <= status <= 299:
        raise RuntimeError(error_msg)
"selenium", # markdown "MARKDOWN", "build_extension", # newspaper "calculate_area", "largest_image_url", "newspaper.images", "newspaper", "Importing", "PIL", ] # 关闭日志打印 for STOP_LOG in STOP_LOGS: log_level = eval("logging." + gloable_setting_dict.get("log_level").upper()) logging.getLogger(STOP_LOG).setLevel(log_level) # print(logging.Logger.manager.loggerDict) # 取使用debug模块的name # 日志级别大小关系为:critical > error > warning > info > debug _log = get_logger( name=gloable_setting_dict.get("log_name"), path=gloable_setting_dict.get("log_path"), log_level=gloable_setting_dict.get("log_level").upper(), is_write_to_file=gloable_setting_dict.get("is_write_to_file"), ) log = _log
async def download(self, request: Request):
    """Fetch one request through the configured downer, with retry/ignore rules.

    Applies per-spider settings (falling back to globals) for retry count,
    default headers, timeout, ignored status codes and inter-request delay;
    enforces the concurrency semaphore; supports both sync and async
    ``downer.fetch`` implementations.

    Args:
        request: the Request to fetch; ``request.__spider__`` must be set.

    Returns:
        The Response on success (also enqueued on ``self.response_queue``),
        or None when the request is dropped, retried later, or fails.

    Raises:
        ValueError: if the effective ``req_max_retry`` setting is not > 0.
    """
    spider = request.__spider__
    # Per-spider settings override the global defaults (falsy -> fallback).
    max_retry = spider.cutome_setting_dict.get("req_max_retry") or gloable_setting_dict.get(
        "req_max_retry")
    if max_retry <= 0:
        raise ValueError("req_max_retry must >0")
    header_dict = spider.cutome_setting_dict.get("default_headers") or gloable_setting_dict.get(
        "default_headers")
    req_timeout = request.timeout or spider.cutome_setting_dict.get("req_timeout") or gloable_setting_dict.get(
        "req_timeout")
    request.timeout = req_timeout
    header = request.header or {}
    # BUGFIX: the original `request.header = header.update(header_dict)`
    # assigned None (dict.update returns None) before being patched by a
    # second assignment, and let the default headers overwrite the request's
    # own headers. Merge so request-specific headers take precedence.
    request.header = {**header_dict, **header}
    ignore_response_codes = spider.cutome_setting_dict.get("ignore_response_codes") or gloable_setting_dict.get(
        "ignore_response_codes")
    req_delay = spider.cutome_setting_dict.get("req_delay") or gloable_setting_dict.get("req_delay")
    if request and request.retry >= max_retry:
        # reached max retry times: notify listeners and drop the request
        self.reminder.go(Reminder.request_dropped, request, scheduler=self.scheduler)
        self.log.error(f'reached max retry times... \n{request}')
        return
    request.retry = request.retry + 1
    # when canceled: bail out if the loop is already shutting down
    loop = asyncio.get_running_loop()
    if loop.is_closed() or not loop.is_running():
        self.log.warning(f'loop is closed in download')
        return
    with suppress(asyncio.CancelledError):
        async with self.semaphore:
            await self._before_fetch(request)
            fetch = self.downer.fetch
            iscoroutinefunction = inspect.iscoroutinefunction(fetch)
            # support sync or async request
            try:
                # req_delay: optional politeness delay before each fetch
                if req_delay > 0:
                    await asyncio.sleep(req_delay)
                self.log.info(f"send a request: url: {request.url}")
                if iscoroutinefunction:
                    response = await fetch(request)
                else:
                    self.log.debug(f'fetch may be an snyc func so it will run in executor ')
                    # BUGFIX: reuse the running loop captured above instead of
                    # the deprecated asyncio.get_event_loop() in a coroutine.
                    response = await loop.run_in_executor(None, fetch, request)
            # BUGFIX: before Python 3.11 asyncio.TimeoutError is distinct from
            # the builtin TimeoutError — catch both for the delayed-retry path.
            except (TimeoutError, asyncio.TimeoutError):
                # delay retry: hand the request back to the scheduler
                wait = self.scheduler.schedlue(request)
                if inspect.isawaitable(wait):
                    await wait
                self.log.debug(
                    f'req to fetch is timeout now so this req will dely to sechdule for retry {request.url}')
                return
            except asyncio.CancelledError:
                self.log.debug(f' task is cancel..')
                return
            except BaseException as e:
                self.log.error(f'occured some exception in downloader e:{e}')
                return
            if response is None or not isinstance(response, Response):
                self.log.error(
                    f'the downer {self.downer.__class__.__name__} fetch function must return a response,'
                    'that is a no-null response, and response must be a '
                    'smart.Response instance or sub Response instance. ')
                return
            self.reminder.go(Reminder.response_downloaded, response)
            if response.status not in ignore_response_codes:
                await self._after_fetch(request, response)
            if response.status not in ignore_response_codes:
                response.request = request
                response.__spider__ = spider
                await self.response_queue.put(response)
                return response