Example 1
def reset_task() -> list:
    import random
    from src.BusinessCentralLayer.middleware.redis_io import RedisClient
    # CRAWLER_SEQUENCE is assumed to be exported from the same settings module
    from src.BusinessCentralLayer.setting import CRAWLER_SEQUENCE, SINGLE_TASK_CAP, REDIS_SECRET_KEY

    rc = RedisClient()
    running_state = {task_name: [] for task_name in CRAWLER_SEQUENCE}
    # __entropy__ is the module-level registry of collector action mappings
    action_list = __entropy__.copy()
    qsize = len(action_list)
    random.shuffle(action_list)
    try:
        # Classify the atomic tasks by crawler type
        for task_name in CRAWLER_SEQUENCE:
            # Amount of data of this type remaining in the pool
            storage_remain: int = rc.get_len(REDIS_SECRET_KEY.format(f'{task_name}'))
            # Collect the atomic tasks whose hyper_params permit this type
            for atomic in action_list:
                permission = {} if atomic.get('hyper_params') is None else atomic.get('hyper_params')
                if permission.get(task_name) is True:
                    running_state[task_name].append(atomic)
            # Stored data already overflows: clear the execution queue for this type
            if storage_remain >= SINGLE_TASK_CAP:
                running_state[task_name] = []
            # Cached plus stored data would exceed the risk threshold (80% of the cap): shed tasks
            while storage_remain + qsize > int(SINGLE_TASK_CAP * 0.8):
                if len(running_state[task_name]) < 1:
                    break
                running_state[task_name].pop()
                qsize -= 1

        # Flatten the per-type queues into a single execution list
        instances = [atomic for task_queue in running_state.values() for atomic in task_queue]
        return instances
    # Network exception: proactively catch RedisClient() connection errors
    except ConnectionError:
        return []
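
A minimal usage sketch for reset_task, assuming it is importable from this module; the local queue and the flush_poseidon helper below are illustrative, not part of the original project:

def flush_poseidon(local_queue) -> int:
    # Hypothetical helper: drain the collector entities produced by reset_task()
    # into a local FIFO queue and report how many were scheduled this beat.
    instances = reset_task()
    for atomic in instances:
        local_queue.put_nowait(atomic)
    return len(instances)


if __name__ == '__main__':
    from queue import Queue

    scheduled = flush_poseidon(Queue())
    print(f"scheduled {scheduled} atomic task(s) this beat")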
Example 2
def sync_actions(
    class_: str,
    mode_sync: str = None,
    only_sync: bool = False,
    beat_sync: bool = True,
):
    """

    @param class_:
    @param mode_sync:  是否同步消息队列。False:同步本机任务队列,True:同步Redis订阅任务
    @param only_sync:
    @param beat_sync:
    @return:
    """
    logger.info(
        f"<TaskManager> Sync{(mode_sync or '').title()} || Synchronizing the <{class_}> task queue ...")

    # ================================================
    # Beat pause -- atomic synchronization
    # ================================================
    rc = RedisClient()
    _state = _is_overflow(task_name=class_, rc=rc)
    if _state == 'stop':
        return _state

    # ================================================
    # Update task information
    # ================================================
    # Publish the collection tasks that are about to be launched
    _update_entropy(rc=rc, entropy=__entropy__)
    # Have the factory read the mapping table and batch-produce collector runtime entities
    sync_queue: list = ActionShunt(class_, silence=True,
                                   beat_sync=beat_sync).shunt()
    # Shuffle the task sequence
    random.shuffle(sync_queue)

    # ================================================
    # Execute the core business logic
    # ================================================
    if mode_sync == 'upload':
        # FIXME: temporary workaround for the connection-overflow problem
        if round(rc.get_len(REDIS_SECRET_KEY.format(class_)) *
                 1.25) > SINGLE_TASK_CAP:
            logger.warning("<TaskManager> UploadHijack -- the connection pool is about to overflow; the upload task has been hijacked")
            return None
        # Keep instantiating collection tasks
        for _ in range(len(sync_queue)):
            rc.sync_message_queue(mode='upload', message=class_)
            # Beat-sync thread lock
            if only_sync:
                logger.warning("<TaskManager> OnlySync -- beat-sync thread lock triggered; uploading only one atomic task")
                break
        logger.success("<TaskManager> UploadTasks -- task upload complete")
    elif mode_sync == 'download':
        async_queue: list = []
        while True:
            # Fetch an atomic task
            atomic = rc.sync_message_queue(mode='download')
            # If the atomic task is valid, sync the data
            if atomic and atomic in CRAWLER_SEQUENCE:
                # Check the synchronization state.
                # Prevent overload: stop syncing once the local buffer is about to hit its capacity limit.
                # _state is one of three values: continue/offload/stop
                _state = _is_overflow(task_name=atomic, rc=rc)
                if _state != 'continue':
                    return _state
                if not async_queue:
                    async_queue = ActionShunt(atomic,
                                              silence=True,
                                              beat_sync=beat_sync).shunt()
                    random.shuffle(async_queue)
                # Push the collector entity onto the local Poseidon message queue
                Middleware.poseidon.put_nowait(async_queue.pop())
                logger.info(
                    f'<TaskManager> offload atomic<{atomic}>({Middleware.poseidon.qsize()})'
                )
                # Beat-sync thread lock
                if only_sync:
                    logger.warning(
                        f"<TaskManager> OnlySync -- <{atomic}> triggered the beat-sync thread lock; downloading only one atomic task"
                    )
                    return 'offload'
            else:
                return 'offload'
    elif mode_sync == 'force_run':
        for slave_ in sync_queue:
            # ================================================================================================
            # TODO v5.4.r new feature: scaffold spawn
            # 1. In earlier versions, neither the scaffold `run` nor `force-run` command could start a
            #    collection task while the queue was fully loaded, mainly because the lines below hold a lock.
            # 2. The new `spawn` command bypasses this module and starts collectors directly through
            #    SpawnBooster-compiled low-level code.
            # ================================================================================================
            # force_run: intended for single-machine deployment or single-step debugging.
            # Overflow must still be ruled out, so even in force_run mode the number of executed tasks
            # must not exceed the task capacity.
            _state = _is_overflow(task_name=class_, rc=rc)
            if _state != 'continue':
                return _state

            # Push the collector entity onto the local Poseidon message queue
            Middleware.poseidon.put_nowait(slave_)

            # Beat-sync thread lock
            if only_sync:
                logger.warning(
                    f"<TaskManager> OnlySync -- <{class_}> triggered the beat-sync thread lock; downloading only one atomic task")
                return 'stop'

        return 'offload'
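
The branches above depend on _is_overflow returning one of three states ('continue'/'offload'/'stop'), as the inline comment notes. Below is a minimal sketch of such a gate, assuming the thresholds derive from SINGLE_TASK_CAP and the local Poseidon queue size; the exact thresholds and function body are assumptions, not the project's actual implementation:

def _is_overflow_sketch(task_name: str, rc) -> str:
    # Hypothetical three-state overflow gate:
    #   'stop'     -> remote storage is already at/above the cap, do not sync at all
    #   'offload'  -> combined load is close to the cap, stop scheduling new tasks
    #   'continue' -> safe to keep scheduling
    storage_remain = rc.get_len(REDIS_SECRET_KEY.format(task_name))
    local_pending = Middleware.poseidon.qsize()
    if storage_remain >= SINGLE_TASK_CAP:
        return 'stop'
    if storage_remain + local_pending >= int(SINGLE_TASK_CAP * 0.8):
        return 'offload'
    return 'continue'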