def _sync_actions(
        class_: str,
        mode_sync: str = None,
        only_sync=False,
        beat_sync=True,
):
    """
    Synchronise the task queue of one task class in the requested mode.

    @param class_: task class name; used for the overflow check, the Redis
        key lookup, the ActionShunt and the uploaded message.
    @param mode_sync: selects the branch that runs:
        'upload'    - call rc.sync_message_queue(mode='upload') once per
                      action produced by ActionShunt;
        'download'  - repeatedly fetch atomic tasks with
                      rc.sync_message_queue(mode='download') and put local
                      actions onto Middleware.poseidon;
        'force_run' - put the actions of `class_` straight onto
                      Middleware.poseidon.
        Any other value (including the default None) matches no branch.
    @param only_sync: when truthy, stop after a single task was handled.
    @param beat_sync: forwarded unchanged to ActionShunt.
    @return: 'stop' when the initial overflow check says so;
        upload    -> None;
        download  -> the first non-'continue' state from _is_overflow,
                     otherwise 'offload';
        force_run -> the first non-'continue' state from _is_overflow,
                     'stop' when only_sync is set, otherwise 'offload';
        unknown mode -> None.
    """
    # mode_sync defaults to None, so guard the .title() call used for the log tag.
    mode_label = mode_sync.title() if mode_sync else ''
    logger.info(
        f"<TaskManager> Sync{mode_label} || 正在同步<{class_}>任务队列...")

    # TODO: make the synchronisation atomic
    rc = RedisClient()

    # Beat pause: bail out immediately when the overflow check says 'stop'.
    _state = _is_overflow(task_name=class_, rc=rc)
    if _state == 'stop':
        return _state

    sync_queue: list = ActionShunt(class_, silence=True, beat_sync=beat_sync).shunt()
    random.shuffle(sync_queue)

    # Generate tasks locally and add them to the message queue.
    if mode_sync == 'upload':

        # FIXME: temporary workaround for the link-overflow problem.
        if round(rc.__len__(REDIS_SECRET_KEY.format(class_)) * 1.25) > SINGLE_TASK_CAP:
            logger.warning("<TaskManager> UploadHijack -- 连接池任务即将溢出,上传任务被劫持")
            return None

        # Upload one message per generated action.
        for _ in sync_queue:
            rc.sync_message_queue(mode='upload', message=class_)

            # Beat-sync lock: upload a single atomic task only.
            if only_sync:
                logger.warning("<TaskManager> OnlySync -- 触发节拍同步线程锁,仅上传一枚原子任务")
                break
        logger.success("<TaskManager> UploadTasks -- 任务上传完毕")

    # Synchronise tasks from the distributed message queue.
    elif mode_sync == 'download':
        async_queue: list = []

        while True:
            # Fetch one atomic task.
            atomic = rc.sync_message_queue(mode='download')

            # Anything empty or unknown ends the synchronisation early.
            if not (atomic and atomic in CRAWLER_SEQUENCE):
                return 'offload'

            # Overload protection: stop syncing when the local buffer is
            # about to exceed its capacity.
            # _state is one of continue / offload / stop.
            _state = _is_overflow(task_name=atomic, rc=rc)
            if _state != 'continue':
                return _state

            # Refill the local action buffer only once it has been drained.
            if not async_queue:
                async_queue = ActionShunt(atomic, silence=True, beat_sync=beat_sync).shunt()
                random.shuffle(async_queue)

            # Push the action onto the local Poseidon queue.
            Middleware.poseidon.put_nowait(async_queue.pop())

            logger.info(
                f'<TaskManager> offload atomic<{atomic}>({Middleware.poseidon.qsize()})'
            )

            # Beat-sync lock: download a single atomic task only.
            if only_sync:
                logger.warning(
                    f"<TaskManager> OnlySync -- <{atomic}>触发节拍同步线程锁,仅下载一枚原子任务"
                )
                return 'offload'

    elif mode_sync == 'force_run':
        for slave_ in sync_queue:

            # force_run is meant for single-host deployment or step debugging.
            # Overflow must still be avoided, so even in this mode the number
            # of queued tasks may not exceed the task capacity.
            _state = _is_overflow(task_name=class_, rc=rc)
            if _state != 'continue':
                return _state

            # Push the action onto the local Poseidon queue.
            Middleware.poseidon.put_nowait(slave_)

            # force_run still honours the beat-sync lock; this serves the
            # host's subscription top-up and interrupts the sync regardless
            # of the remaining queue capacity.
            if only_sync:
                logger.warning(
                    f"<TaskManager> OnlySync -- <{class_}>触发节拍同步线程锁,仅下载一枚原子任务")
                return 'stop'

        # NOTE(review): kept inside the force_run branch — confirm this
        # nesting against the intended layout.
        return 'offload'
def apis_admin_get_entropy() -> list:
    """
    Read the "__entropy__" entry through the Redis driver and split it on "$".

    @return: the "$"-separated fields of the stored value, or an empty list
        when the driver returns None for the key.
    """
    raw = RedisClient().get_driver().get(REDIS_SECRET_KEY.format("__entropy__"))

    # NOTE(review): presumably a redis-py client, whose get() yields None for
    # a missing key; without this guard .split() raises AttributeError.
    if raw is None:
        return []
    return raw.split("$")
def __init__(self, debug=False, kill_target: str = None):
    """
    Set up the cleaner's configuration and its Redis handle.

    @param debug: stored on the instance as `self.debug`.
    @param kill_target: stored on the instance as `self.kill_`.
    """
    super(SubscribesCleaner, self).__init__()

    # Plain configuration copied from the arguments.
    self.debug = debug
    self.kill_ = kill_target

    # One formatted Redis key per entry of CRAWLER_SEQUENCE, in the same order.
    self.keys = list(map(REDIS_SECRET_KEY.format, CRAWLER_SEQUENCE))

    # Driver object obtained from a fresh RedisClient.
    redis_client = RedisClient()
    self.rc = redis_client.get_driver()
def select_subs_to_admin(select_netloc: str = None, _debug=False) -> dict:
    """
    Report the subscription pool status, or hand out one subscription.

    @param select_netloc: when falsy, only the pool status is returned.
        Otherwise the first pooled subscription whose URL netloc contains
        this string and which is not stale (beyond=6) is returned.
    @param _debug: echoed in the response; when falsy, a `detach` thread is
        started for the subscription that is handed out.
    @return: a dict whose 'msg' is 'success' or 'failed':
        status query -> {'msg', 'info': per-type netloc counters};
        hit          -> {'msg', 'debug', 'info': {subscribe, endLife,
                         subsType, netloc}};
        miss         -> {'msg', 'netloc', 'info', 'status'}.
    """
    # Every subscription in the pool, across all types.
    remain_subs = []
    # type -> {netloc: count}
    mapping_subs_status = {}
    # subscription link -> type
    mapping_subs_type = {}

    # A single client is reused for every Redis call below instead of
    # building a new RedisClient per type and per candidate subscription.
    rc = RedisClient()

    # Collect and classify the pooled subscriptions.
    for filed in CRAWLER_SEQUENCE:
        # All subscription entries of this type.
        filed_subs: list = rc.sync_remain_subs(REDIS_SECRET_KEY.format(filed))
        remain_subs.extend(filed_subs)

        links = [entry[0] for entry in filed_subs]
        netlocs = [urlparse(link).netloc for link in links]

        mapping_subs_status[filed] = dict(Counter(netlocs))
        mapping_subs_type.update(dict.fromkeys(links, filed))

    # Without a netloc filter, just report the pool status.
    if not select_netloc:
        rc.update_api_status(api_name="search",
                             date_format=str(datetime.now(TIME_ZONE_CN)))
        return {'msg': 'success', 'info': mapping_subs_status}

    for tag in remain_subs:
        # First element is the link, last element is its end-of-life value.
        subscribe, end_life = tag[0], tag[-1]

        # Needs a matching netloc and must not be stale (beyond=6).
        if select_netloc in urlparse(subscribe).netloc and not rc.is_stale(
                end_life, beyond=6):
            logger.debug("<SubscribeIO> -- GET SUBSCRIPTION")
            rc.update_api_status(api_name="get",
                                 date_format=str(datetime.now(TIME_ZONE_CN)))

            response = {
                'msg': "success",
                'debug': _debug,
                'info': {
                    "subscribe": subscribe,
                    "endLife": end_life,
                    'subsType': mapping_subs_type[subscribe],
                    "netloc": select_netloc
                }
            }

            # Outside debug mode, run `detach` for the handed-out link in a
            # background thread once the response has been built.
            if not _debug:
                threading.Thread(target=detach,
                                 kwargs={
                                     "subscribe": subscribe,
                                     'beat_sync': True
                                 }).start()
            return response

    # Nothing in stock, or the request was malformed.
    return {
        'msg': "failed",
        "netloc": select_netloc,
        "info": "指令错误或不存在该类型订阅",
        "status": mapping_subs_status
    }