def reset_task() -> list:
    """Build the execution queue of atomic crawler tasks for one scheduling round.

    For every task type in ``CRAWLER_SEQUENCE``, select the atomics from a
    shuffled copy of the global ``__entropy__`` action table whose
    ``hyper_params`` explicitly enable that type, then trim each per-type
    queue so the Redis pool does not overflow its capacity threshold.

    @return: flattened list of runnable atomic tasks; an empty list when the
        pool is saturated or Redis is unreachable.
    """
    import random

    from src.BusinessCentralLayer.middleware.redis_io import RedisClient
    from src.BusinessCentralLayer.setting import SINGLE_TASK_CAP, REDIS_SECRET_KEY

    rc = RedisClient()
    # One (initially empty) task queue per crawler task type.
    running_state = {task_name: [] for task_name in CRAWLER_SEQUENCE}

    # Work on a shuffled copy so scheduling order varies between rounds.
    action_list = __entropy__.copy()
    qsize = len(action_list)
    random.shuffle(action_list)

    try:
        # Classify the atomic tasks of each entity type.
        for task_name in CRAWLER_SEQUENCE:
            # Remaining cached entities of this type in the Redis pool.
            storage_remain: int = rc.get_len(REDIS_SECRET_KEY.format(f'{task_name}'))

            # Keep the atomics whose hyper_params grant this task type.
            for atomic in action_list:
                permission = atomic.get('hyper_params') or {}
                if permission.get(task_name) is True:
                    running_state[task_name].append(atomic)

            # Pool already full -> return an empty queue for this type.
            if storage_remain >= SINGLE_TASK_CAP:
                running_state[task_name] = []

            # Cached + pending tasks exceed the risk threshold (80% of cap):
            # drop tasks until back under it.
            # NOTE(review): ``qsize`` is shared across task types, so pops for
            # one type also lower the threshold check for later types —
            # preserved as-is; confirm this is intended.
            while storage_remain + qsize > int(SINGLE_TASK_CAP * 0.8):
                if len(running_state[task_name]) < 1:
                    break
                running_state[task_name].pop()
                qsize -= 1

        # Flatten the non-empty per-type queues into one execution list.
        return [atomic for queue in running_state.values() if queue for atomic in queue]
    # Network failure: actively capture RedisClient's connection error and
    # fall back to an empty execution queue.
    except ConnectionError:
        return []
def sync_actions(
        class_: str,
        mode_sync: str = None,
        only_sync=False,
        beat_sync=True,
):
    """Synchronize the task queue for one crawler class.

    @param class_: crawler task type (an element of ``CRAWLER_SEQUENCE``).
    @param mode_sync: 同步消息队列的方式。'upload' pushes atomic tasks to the
        Redis message queue, 'download' pulls them into the local Poseidon
        queue, 'force_run' feeds the locally produced queue directly.
    @param only_sync: beat-sync thread lock; when True, transfer exactly one
        atomic task per call.
    @param beat_sync: forwarded to ``ActionShunt`` when producing collector
        runtime entities.
    @return: scheduler state string ('stop' / 'offload') or None.
    """
    # BUGFIX: the declared default is None, so calling ``mode_sync.title()``
    # directly raised AttributeError — guard with ``or ''``.
    logger.info(
        f"<TaskManager> Sync{(mode_sync or '').title()} || 正在同步<{class_}>任务队列...")
    # ================================================
    # Beat pause: atomic synchronization
    # ================================================
    rc = RedisClient()
    _state = _is_overflow(task_name=class_, rc=rc)
    if _state == 'stop':
        return _state
    # ================================================
    # Update task information
    # ================================================
    # Publish the entropy of the collection tasks about to launch.
    _update_entropy(rc=rc, entropy=__entropy__)
    # Batch-produce collector runtime entities from the factory mapping table.
    sync_queue: list = ActionShunt(class_, silence=True, beat_sync=beat_sync).shunt()
    # Shuffle the task sequence.
    random.shuffle(sync_queue)
    # ================================================
    # $ Core business
    # ================================================
    if mode_sync == 'upload':
        # fixme: temporary workaround for the link-pool overflow problem
        if round(rc.get_len(REDIS_SECRET_KEY.format(class_)) * 1.25) > SINGLE_TASK_CAP:
            logger.warning("<TaskManager> UploadHijack -- 连接池任务即将溢出,上传任务被劫持")
            return None
        # Continuously instantiate collection tasks (one upload per entity).
        for _ in sync_queue:
            rc.sync_message_queue(mode='upload', message=class_)
            # Beat-sync thread lock: upload exactly one atomic task.
            if only_sync:
                logger.warning("<TaskManager> OnlySync -- 触发节拍同步线程锁,仅上传一枚原子任务")
                break
        logger.success("<TaskManager> UploadTasks -- 任务上传完毕")
    elif mode_sync == 'download':
        async_queue: list = []
        while True:
            # Fetch one atomic task from the remote message queue.
            atomic = rc.sync_message_queue(mode='download')
            # Sync only when the atomic is a valid task type.
            if atomic and atomic in CRAWLER_SEQUENCE:
                # Overload protection: stop syncing when the local buffer is
                # about to hit its capacity limit.
                # _state is one of: continue / offload / stop.
                _state = _is_overflow(task_name=atomic, rc=rc)
                if _state != 'continue':
                    return _state
                # Lazily (re)fill the local buffer of collector entities.
                if not async_queue:
                    async_queue = ActionShunt(atomic, silence=True, beat_sync=beat_sync).shunt()
                    random.shuffle(async_queue)
                # Push one collector entity onto the local Poseidon queue.
                Middleware.poseidon.put_nowait(async_queue.pop())
                logger.info(
                    f'<TaskManager> offload atomic<{atomic}>({Middleware.poseidon.qsize()})'
                )
                # Beat-sync thread lock: download exactly one atomic task.
                if only_sync:
                    logger.warning(
                        f"<TaskManager> OnlySync -- <{atomic}>触发节拍同步线程锁,仅下载一枚原子任务"
                    )
                    return 'offload'
            else:
                return 'offload'
    elif mode_sync == 'force_run':
        for slave_ in sync_queue:
            # ========================================================================
            # TODO v5.4.r new feature: scaffold spawn
            # 1. In earlier versions, neither the ``run`` nor ``force-run``
            #    scaffold commands could launch collection tasks while the
            #    queue was at capacity, because the lines below act as a lock.
            # 2. The new ``spawn`` command bypasses this module via
            #    SpawnBooster, compiling the low-level code to start the
            #    collector directly.
            # ========================================================================
            # force_run: intended for single-machine deployment or single-step
            # debugging. Even in force_run mode the number of executed tasks
            # must not exceed the task capacity, to avoid overflow.
            _state = _is_overflow(task_name=class_, rc=rc)
            if _state != 'continue':
                return _state
            # Push the collector entity onto the local Poseidon queue.
            Middleware.poseidon.put_nowait(slave_)
            # Beat-sync thread lock: download exactly one atomic task.
            if only_sync:
                logger.warning(
                    f"<TaskManager> OnlySync -- <{class_}>触发节拍同步线程锁,仅下载一枚原子任务")
                return 'stop'
        return 'offload'