class SubscribesCleaner(CoroutineSpeedup):
    """Decoupling cleaner plugin.

    Reads every subscription stored in the Redis hashes listed in
    ``self.keys`` and deletes the ones that either contain ``kill_target``
    or whose parsed ``node`` collection is not larger than a threshold.

    Note from the original author: running this from a mainland-China IP is
    likely to cause a noticeable performance drop.
    """

    def __init__(self, debug=False, kill_target: str = None):
        """
        @param debug: when true, print one "check" line per parsed subscription
        @param kill_target: substring; a subscription containing it is deleted
            without being parsed
        """
        super(SubscribesCleaner, self).__init__()
        self.debug = debug
        # One Redis hash name per entry of CRAWLER_SEQUENCE.
        self.keys = [REDIS_SECRET_KEY.format(s) for s in CRAWLER_SEQUENCE]
        self.rc = RedisClient().get_driver()
        self.kill_ = kill_target

    def offload_task(self):
        """Put one ``[subscription, hash_key]`` task on the work queue for
        every field of every hash in ``self.keys``."""
        for key_ in self.keys:
            for sub, _ in self.rc.hgetall(key_).items():
                self.work_q.put_nowait([sub, key_])

    def _del_subs(self, key_: str, subs: str, err_: str = '') -> None:
        """Delete field ``subs`` from hash ``key_`` and log the reason.

        @param key_: name of the Redis hash
        @param subs: subscription link (the hash field)
        @param err_: short reason that is appended to the debug log line
        """
        self.rc.hdel(key_, subs)
        logger.debug(f'>> Detach -> {subs} -- {err_}')

    def control_driver(self, sub_info: List[str], threshold: int = 4):
        """Check a single subscription and delete it when it should go.

        @param sub_info: [subs, key_secret_class] -- the subscription link
            followed by the name of the hash it is stored in
        @param threshold: a subscription whose ``node`` collection has at most
            this many entries is deleted
        @return: None
        """
        try:
            # Explicitly targeted subscriptions are removed without parsing.
            if self.kill_ and self.kill_ in sub_info[0]:
                self._del_subs(sub_info[-1], sub_info[0], "target")
            else:
                # Parse the subscription.
                # NOTE(review): meaning of the second positional argument of
                # subs2node is not visible here -- confirm against its definition.
                node_info: dict = subs2node(sub_info[0], False)
                node_count = len(node_info['node'])
                if self.debug:
                    print(f"check -- {node_info['subs']} -- {node_count}")
                # Decouple subscriptions that carry too few nodes.
                if node_count <= threshold:
                    self._del_subs(sub_info[-1], sub_info[0], "decouple")
        # A tuple is required here: ``except A or B`` evaluates ``A or B``
        # first and therefore only ever catches ``A``.
        except (UnicodeDecodeError, TypeError) as e:
            logger.debug(
                f"Retry put the subscribe({sub_info}) to work queue -- {e}")
            # Count the failures per link; a link is re-queued at most three
            # times and is deleted on the following failure.
            attempts = (self.temp_cache.get(sub_info[0]) or 0) + 1
            self.temp_cache[sub_info[0]] = attempts
            if attempts <= 3:
                self.work_q.put_nowait(sub_info)
            else:
                self._del_subs(sub_info[-1], sub_info[0], str(e))
        except SystemExit:
            logger.critical("请关闭系统代理后再执行订阅清洗操作")
        except Exception as e:
            # Anything unexpected: report it and drop the subscription,
            # keeping the reason in the detach log as well.
            logger.warning(f"{sub_info} -- {e}")
            self._del_subs(sub_info[-1], sub_info[0], str(e))
def detach(subscribe, beat_sync=False):
    """Detach pooled subscriptions whose URL path equals that of ``subscribe``.

    @param subscribe: subscription link; its URL path is used as the token
    @param beat_sync: True -> delete the matching hash field immediately;
        False -> overwrite its value with a past datetime string so that it is
        removed later (per the original author: by the next "ddt" task)
    @return: None
    """
    from faker import Faker
    from urllib.parse import urlparse

    # Extract the token (URL path) from the given subscription.
    wanted_path = urlparse(subscribe).path

    driver = RedisClient().get_driver()

    # Walk every task type and the link pool that belongs to it.
    for task in CRAWLER_SEQUENCE:
        pool_key = REDIS_SECRET_KEY.format(task)
        for link, _ in driver.hgetall(pool_key).items():
            # Skip links that belong to a different token.
            if urlparse(link).path != wanted_path:
                continue
            if not beat_sync:
                # Mark the link as expired instead of deleting it right away.
                driver.hset(pool_key, link, str(Faker().past_datetime()))
            else:
                driver.hdel(pool_key, link)
                logger.debug(f'>> Detach -> {link}')
            # NOTE(review): the source was collapsed onto one line; the break
            # is taken to end the scan of the current pool after the first
            # match -- confirm against the original layout.
            break
class SubscribesCleaner(CoroutineSpeedup):
    """Decoupling cleaner plugin.

    Note from the original author: running this from a mainland-China IP is
    likely to cause a noticeable performance drop.
    """

    def __init__(self, debug=False, kill_target: str = None):
        """
        :param debug: when true, print a "valid" line for every kept subscription
        :param kill_target: substring; a subscription containing it is removed
            without being parsed
        """
        super(SubscribesCleaner, self).__init__()
        self.debug = debug
        self.keys = [REDIS_SECRET_KEY.format(s) for s in CRAWLER_SEQUENCE]
        self.rc = RedisClient().get_driver()
        self.kill_ = kill_target

    def offload_task(self):
        """Queue a ``[subscription, hash_key]`` task for every stored field."""
        for pool_key in self.keys:
            pool = self.rc.hgetall(pool_key)
            for link, _ in pool.items():
                self.work_q.put_nowait([link, pool_key])

    def _del_subs(self, key_: str, subs: str, err_) -> None:
        """Remove ``subs`` from hash ``key_`` and echo the reason to stdout.

        :param key_: name of the Redis hash
        :param subs: subscription link (the hash field)
        :param err_: reason shown at the end of the printed line
        """
        self.rc.hdel(key_, subs)
        print(Fore.BLUE, f"[{datetime.now()}] detach -> {subs} {err_}")

    def control_driver(self, sub_info: List[str], threshold: int = 4):
        """Inspect one subscription and remove it when it fails the checks.

        :param sub_info: [subs, key_secret_class]
        :param threshold: decoupling threshold; a subscription whose ``node``
            collection has at most this many entries is removed
        :return: None
        """
        try:
            link, pool_key = sub_info[0], sub_info[-1]

            # Cleaning aimed at one specific subscription source.
            if self.kill_ and self.kill_ in link:
                self._del_subs(pool_key, link, "target active removal")
                return

            # Parse the subscription and count what came back.
            parsed: dict = subs2node(link)
            node_count = len(parsed['node'])

            if node_count <= threshold:
                self._del_subs(pool_key, link, "decouple active removal")
                return

            if self.debug:
                print(
                    Fore.WHITE,
                    f"[{datetime.now()}] valid -- {parsed['subs']} -- {node_count}"
                )
        except (UnicodeDecodeError, TypeError) as e:
            link, pool_key = sub_info[0], sub_info[-1]
            previous = self.temp_cache.get(link)
            if not previous:
                # First parse failure for this link: announce it and start counting.
                print(Fore.YELLOW, f"[{datetime.now()}] recheck -- {link}")
                attempts = 1
            else:
                # Already marked as "parse error": bump its counter.
                attempts = previous + 1
            self.temp_cache[link] = attempts

            if attempts > 3:
                # Too many failures: drop the link.
                self._del_subs(pool_key, link, e)
            else:
                # Otherwise append it to the tail of the work queue again.
                self.work_q.put_nowait(sub_info)
        except SystemExit:
            warnings.warn("请关闭系统代理后部署订阅清洗任务")
        except Exception as e:
            logger.warning(f"{sub_info} -- {e}")
            self._del_subs(sub_info[-1], sub_info[0], e)
class SubscribesCleaner(lsu):
    """Decoupling cleaner plugin.

    Collects in ``self.apollo`` every subscription that either contains
    ``kill_target`` or whose parsed ``node`` collection is not larger than a
    threshold; ``killer`` then deletes the collected entries from Redis.

    Note from the original author: running this from a mainland-China IP is
    likely to cause a noticeable performance drop.
    """

    def __init__(self, debug=False, kill_target: str = None):
        """
        @param debug: when true, print one "check" line per parsed subscription
        @param kill_target: substring; a subscription containing it is marked
            for deletion without being parsed
        """
        super(SubscribesCleaner, self).__init__()
        self.debug = debug
        # One Redis hash name per entry of CRAWLER_SEQUENCE.
        self.keys = [REDIS_SECRET_KEY.format(s) for s in CRAWLER_SEQUENCE]
        self.rc = RedisClient().get_driver()
        self.kill_ = kill_target

    def offload_task(self):
        """Put one ``[subscription, hash_key]`` task on the work queue for
        every field of every hash in ``self.keys``."""
        for key_ in self.keys:
            for sub, _ in self.rc.hgetall(key_).items():
                self.work_q.put_nowait([sub, key_])

    def killer(self):
        """Delete every ``[hash_key, subscription]`` pair held in ``self.apollo``.

        TODO: remove (or move) the hash fields with one batched Redis call.

        @return: None
        """
        if self.apollo:
            for kill_ in self.apollo:
                self.rc.hdel(kill_[0], kill_[-1])
                logger.debug(f'>> Detach -> {kill_[-1]}')

    def control_driver(self, sub_info: List[str], threshold: int = 3):
        """Check a single subscription and mark it for deletion when needed.

        @param sub_info: [subs, key_secret_class] -- the subscription link
            followed by the name of the hash it is stored in
        @param threshold: a subscription whose ``node`` collection has at most
            this many entries is marked for deletion
        @return: None
        """
        try:
            # Explicitly targeted subscriptions are marked without parsing.
            if self.kill_ and self.kill_ in sub_info[0]:
                self.apollo.append([sub_info[-1], sub_info[0]])
            else:
                # Parse the subscription.
                # NOTE(review): meaning of the second positional argument of
                # subs2node is not visible here -- confirm against its definition.
                node_info: dict = subs2node(sub_info[0], False)
                node_count = len(node_info['node'])
                if self.debug:
                    print(f"check -- {node_info['subs']} -- {node_count}")
                # Decouple subscriptions that carry too few nodes.
                if node_count <= threshold:
                    self.apollo.append([sub_info[-1], sub_info[0]])
        # A tuple is required here: ``except A or B`` evaluates ``A or B``
        # first and therefore only ever catches ``A``.
        except (UnicodeDecodeError, TypeError) as e:
            logger.debug(
                f"Retry put the subscribe({sub_info}) to work queue -- {e}")
            # Count the failures per link; a link is re-queued at most three
            # times and is marked for deletion on the following failure.
            attempts = (self.temp_cache.get(sub_info[0]) or 0) + 1
            self.temp_cache[sub_info[0]] = attempts
            if attempts <= 3:
                self.work_q.put_nowait(sub_info)
            else:
                self.apollo.append([sub_info[-1], sub_info[0]])
        except Exception as e:
            # Anything unexpected: report it and mark the subscription.
            logger.warning(f"{sub_info} -- {e}")
            self.apollo.append([sub_info[-1], sub_info[0]])
class SubscribesCleaner(CoroutineSpeedup):
    """Decoupling cleaner plugin.

    Note from the original author: running this from a mainland-China IP is
    likely to cause a noticeable performance drop.
    """

    def __init__(self, debug=False, kill_target: str = None):
        """
        :param debug: when true, echo a "valid" line for every kept
            subscription and suppress the final success log in ``killer``
        :param kill_target: substring; a subscription containing it is removed
            without being parsed
        """
        super(SubscribesCleaner, self).__init__()
        self.debug = debug
        self.keys = [REDIS_SECRET_KEY.format(s) for s in CRAWLER_SEQUENCE]
        self.rc = RedisClient().get_driver()
        self.kill_ = kill_target

    def offload_task(self):
        """Queue a ``[subscription, hash_key]`` task for every stored field.

        A hash that answers with a Redis ``ResponseError`` is reported and
        skipped; the remaining hashes are still processed.
        """
        for pool_key in self.keys:
            try:
                pool = self.rc.hgetall(pool_key)
                for link, _ in pool.items():
                    self.work_q.put_nowait([link, pool_key])
            except redis_error.ResponseError:
                logger.critical("Link pool is broken down.")

    def _del_subs(self, key_: str, subs: str, err_) -> None:
        """Remove ``subs`` from hash ``key_`` and echo the reason.

        :param key_: name of the Redis hash
        :param subs: subscription link (the hash field)
        :param err_: reason shown at the end of the echoed line
        """
        try:
            self.rc.hdel(key_, subs)
            terminal_echo(f"detach -> {subs} {err_}", 3)
        except redis_error.ConnectionError:
            logger.critical(
                "<SubscribeCleaner> The local network communication is abnormal."
            )

    def control_driver(self, sub_info: List[str], threshold: int = 4):
        """Inspect one subscription and remove it when it fails the checks.

        :param sub_info: [subs, key_secret_class]
        :param threshold: decoupling threshold; a subscription whose ``node``
            collection has at most this many entries is removed
        :return: None
        """
        super(SubscribesCleaner, self).control_driver(task=sub_info)
        try:
            link, pool_key = sub_info[0], sub_info[-1]

            # Cleaning aimed at one specific subscription source.
            if self.kill_ and self.kill_ in link:
                self._del_subs(pool_key, link, "target active removal")
                return

            # Parse the subscription and count what came back.
            parsed: dict = subs2node(link)
            node_count = len(parsed['node'])

            if node_count <= threshold:
                self._del_subs(pool_key, link, "decouple active removal")
                return

            if self.debug:
                terminal_echo(
                    f"valid -- {parsed['subs']} -- {node_count}", 1)
        except (UnicodeDecodeError, TypeError) as e:
            link, pool_key = sub_info[0], sub_info[-1]
            previous = self.temp_cache.get(link)
            if not previous:
                # First parse failure for this link: announce it and start counting.
                terminal_echo(f"recheck -- {link}", 2)
                attempts = 1
            else:
                # Already marked as "parse error": bump its counter.
                attempts = previous + 1
            self.temp_cache[link] = attempts

            if attempts > 3:
                # Too many failures: drop the link.
                self._del_subs(pool_key, link, e)
            else:
                # Otherwise append it to the tail of the work queue again.
                self.work_q.put_nowait(sub_info)
        except SystemExit:
            warnings.warn("请关闭系统代理后部署订阅清洗任务")
        except Exception as e:
            logger.warning(f"{sub_info} -- {e}")
            self._del_subs(sub_info[-1], sub_info[0], e)

    def killer(self):
        """Log the completion message unless the cleaner runs in debug mode."""
        if self.debug:
            return
        logger.success("<SubscribesCleaner> --> decouple compete.")