Example 1
    def sync_message_queue(self, mode: str, message: str = None):
        """
        Synchronize the message queue. For now this is only used to sync the task queue.
        @todo The data structure is preset as a List; each download pops (removes) one element.
        @todo Read/write separation: upload to the master and propagate to the cluster
              via subscription; slaves read from the cluster.
        @param message: message payload
        @param mode: "upload" or "download"
        @return:
        """

        # Publish a task / subscribe to tasks
        if mode == "upload":
            if message:
                self.db.lpush("Poseidon", message)
                # logger.info(f"<RedisClient> UploadTask || {message}")
                return True
            else:
                logger.warning("<RedisClient> EmptyTask || the message payload to upload is empty")
                return False
        # Sync the task queue: pop one atomic task
        elif mode == "download":
            if self.db.exists("Poseidon"):
                return self.db.lpop("Poseidon")
            else:
                return False
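A minimal round-trip sketch of the queue above, assuming an initialized RedisClient instance (the connection settings and the decode_responses flag are assumptions):

# Hypothetical usage: push one atomic task, then pop it back.
rc = RedisClient()
rc.sync_message_queue(mode="upload", message="v2ray")   # -> True
atomic = rc.sync_message_queue(mode="download")         # -> b"v2ray" (str with decode_responses=True)
if atomic:
    print(f"got atomic task: {atomic}")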
Example 2
 def sync_launch_interval() -> dict:
     # Read the scheduling configuration
     launch_interval = LAUNCH_INTERVAL
     # Validate the settings and return the corrected task configuration
     for task_name, task_interval in launch_interval.items():
         # Value missing or an abnormal number supplied
         if (not task_interval) or (task_interval <= 1):
             logger.critical(
                 f"<launch_interval>--{task_name} fatal setting: the interval is empty or less than 1; circuit-breaking the thread")
             raise Exception
         # A float was supplied
         if not isinstance(task_interval, int):
             logger.warning(
                 f"<launch_interval>--{task_name} task interval should be an int; the parameter has been coerced")
             # Try a type conversion; if it fails, fall back to a 60s default
             try:
                 launch_interval.update({task_name: int(task_interval)})
             except (TypeError, ValueError):
                 launch_interval.update({task_name: 60})
         # A task interval that is too small (a firing rate set too high) is
         # intercepted and raised to the minimum fault-tolerant 60s/run
         if task_interval < 60:
             logger.warning(
                 f"<launch_interval>--{task_name} task fires too often; it should run at most once per 60s, the parameter has been coerced")
             launch_interval.update({task_name: 60})
     return launch_interval
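A worked sketch of the correction rules, assuming LAUNCH_INTERVAL is a plain dict (the task names are illustrative):

# Input                                  and what sync_launch_interval() returns
LAUNCH_INTERVAL = {
    "collector": 30,    # int below the floor -> raised to 60
    "decouple": 120.5,  # float               -> coerced to int(120)
}
# sync_launch_interval() -> {"collector": 60, "decouple": 120}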
Example 3
    def __init__(self):
        super(RedisDataDisasterTolerance, self).__init__()

        from src.BusinessCentralLayer.setting import REDIS_SLAVER_DDT
        if not REDIS_SLAVER_DDT.get('host'):
            logger.warning('No disaster-tolerance server configured; the Master will take over this role')
            # Copy the parameters; without .copy() the update below would mutate REDIS_MASTER itself
            redis_virtual = REDIS_MASTER.copy()
            # Redirect the shallow copy to the adjacent database index
            redis_virtual.update({'db': redis_virtual['db'] + 1})
            logger.debug("Backup redirected --> {}".format(redis_virtual))
        else:
            redis_virtual = REDIS_SLAVER_DDT
        # Initialize the container
        self.docker = {}
        try:
            self.acm = RedisClient(host=redis_virtual['host'],
                                   port=redis_virtual['port'],
                                   password=redis_virtual['password'])
            logger.info("DDT: Master({}) -> Slaver({})".format(
                REDIS_MASTER['host'], redis_virtual['host']))
        except redis.exceptions.ConnectionError as e:
            logger.exception(e)
        finally:
            self.redis_virtual = redis_virtual
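The .copy() above is load-bearing: plain assignment would alias REDIS_MASTER, and the db bump would corrupt the master settings. A minimal demonstration of the pitfall:

REDIS_MASTER = {"host": "127.0.0.1", "db": 0}
alias = REDIS_MASTER                    # no copy: both names point at one dict
alias.update({"db": alias["db"] + 1})
print(REDIS_MASTER["db"])               # 1 -- the master config was silently changed
independent = REDIS_MASTER.copy()       # shallow copy: top-level keys are now independent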
Example 4
 def run(self, api=None):
     logger.debug(
         f">> RUN <{self.action_name}> --> beat_sync[{self.beat_sync}] feature[General]"
     )
     # Load the task settings
     api = self.set_spider_option() if api is None else api
     # Execute the core business logic
     try:
         # Set an elastic timer: if the target site fails to render the expected
         # content within the time limit, the instance destroys itself, keeping
         # the spider from getting trapped in a "standoff" with the webmaster
         self.get_html_handle(api=api,
                              url=self.register_url,
                              wait_seconds=45)
         # Register an account
         self.sign_up(api)
         # Enter the site and wait for the core elements to finish rendering
         self.wait(api, 40, "//div[@class='card-body']")
         # Capture subscriptions in the priority order of their atomic types
         self.capture_subscribe(api)
     except TimeoutException:
         logger.error(
             f'>>> TimeoutException <{self.action_name}> -- {self.register_url}'
         )
     except WebDriverException as e:
         logger.error(f">>> WebDriverException <{self.action_name}> -- {e}")
     except (HTTPError, ConnectionRefusedError, ConnectionResetError):
         pass
     except Exception as e:
         logger.warning(f">>> Exception <{self.action_name}> -- {e}")
     finally:
         api.quit()
Example 5
 def deploy_jobs(self):
     try:
         for docker in self.dockers:
             # Add a job
             job = self.scheduler_.add_job(
                 func=docker['api'],
                 # Note: jitter is a trigger option in APScheduler 3.x; passed to
                 # add_job next to a trigger instance it would be silently ignored
                 trigger=IntervalTrigger(
                     seconds=self.interval_[docker['name']],
                     jitter=5),
                 id=docker['name'],
                 # Maximum number of concurrently running instances of this job
                 max_instances=16,
                 # Coalesce several queued runs of the same misfired job into one
                 coalesce=True,
             )
             self.jobs.append(job)
             # Log it
             logger.info(
                 f'<BlockingScheduler> Add job -- <{docker["name"]}>'
                 f' IntervalTrigger: {self.interval_[docker["name"]]}s')
         # Start the scheduler (blocking)
         self.scheduler_.start()
     except KeyboardInterrupt:
         self.scheduler_.shutdown(wait=False)
         logger.warning(
             "<BlockingScheduler> The admin forcibly terminated the scheduled task"
         )
     except Exception as err:
         logger.exception(f'<BlockingScheduler>||{err}')
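A self-contained sketch of the same APScheduler pattern, runnable on its own (the job body and interval are illustrative):

from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.interval import IntervalTrigger

def heartbeat():
    print("tick")

scheduler = BlockingScheduler()
scheduler.add_job(func=heartbeat,
                  trigger=IntervalTrigger(seconds=60, jitter=5),  # jitter belongs to the trigger
                  id="heartbeat",
                  max_instances=16,  # concurrent instances allowed for this job
                  coalesce=True)     # collapse missed runs into one
try:
    scheduler.start()                # blocks the calling thread
except KeyboardInterrupt:
    scheduler.shutdown(wait=False)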
Example 6
 def ddt(task_name: str = None):
     if not task_name:
         _cd.startup_ddt_overdue()
     elif not (isinstance(task_name, str)
               and task_name in CRAWLER_SEQUENCE):
         logger.warning(
             "<Interface> Invalid argument (task_name); the task type must be one of CRAWLER_SEQUENCE")
     else:
         _cd.startup_ddt_overdue(task_name)
Example 7
    def run_deploy() -> None:
        """
        Scheduled tasks. Prefer if-branches over a for-loop when building the task thread pool.
        @return:
        """
        # Load the scheduled-task permission settings
        tasks = ENABLE_DEPLOY['tasks']
        task2function = {
            'ddt_decouple': _cd.startup_ddt_decouple,
            'ddt_overdue': _cd.startup_ddt_overdue,
        }
        try:
            # Initialize the schedulers
            docker_of_based_scheduler = TasksScheduler()
            docker_of_collector_scheduler = CollectorScheduler()
            # Clean the settings so the scheduling intervals stay reasonable
            interval = _cd.sync_launch_interval()
            # Add the jobs
            for docker_name, permission in tasks.items():
                logger.info(
                    f"[Job] {docker_name} -- interval: {interval[docker_name]}s -- run: {permission}"
                )
                # When the collector is enabled, map its task through CollectorScheduler;
                # to fall back to the old strategy, simply comment out this routing branch
                if docker_name == "collector":
                    docker_of_collector_scheduler.mapping_config({
                        'interval': interval[docker_name],
                        'permission': permission,
                    })
                    continue
                if permission:
                    docker_of_based_scheduler.add_job({
                        "name": docker_name,
                        "api": task2function[docker_name],
                        'interval': interval[docker_name],
                        'permission': True,
                    })
            # Start the scheduled tasks; running the collector requires at least
            # one other deployment task alongside it
            docker_of_collector_scheduler.deploy_jobs()
            docker_of_based_scheduler.deploy_jobs()
        except ConnectionError:
            logger.warning(
                "<RedisIO> Network communication failure, please check the network connection."
            )
        except KeyError:
            logger.critical(f'The config hub layer was tampered with; ENABLE_DEPLOY has no matching key-value pair {tasks}')
            sys.exit()
        except NameError:
            logger.critical('eval()/exec() syntax error; check for inconsistent variable names.')
Example 8
    def run(self):
        try:
            if any(not os.path.exists(node) for node in self.root):
                logger.warning('System files are incomplete!')
                logger.debug("Starting the <project-rebuild> module...")
                self.set_up_file_tree(self.root)
            self.check_config()

        finally:
            if self.flag:
                logger.success(">>> Runtime environment linked; please restart the project")
                logger.warning(">>> Reminder: configure Chrome and the matching ChromeDriver version")
                sys.exit()
Example 9
def send_email(msg, to_: Union[List[str], str, set], headers: str = None):
    """
    Send an operations notice. This function only sends simple plain-text messages.
    :param msg: message body
    :param to_: recipients
                1. str
                    to_ == 'self' sends the mail to "yourself"
                2. List[str]
                    a list of addresses for a bulk mail (identical content)
    :param headers: mail subject; defaults to '<V2Ray云彩姬>运维日志'
    :@todo add log-reading (open file) and rich-text (HTML mail) features
    :return:
    """
    headers = headers if headers else '<V2Ray云彩姬>运维日志'
    sender = SMTP_ACCOUNT.get('email')
    password = SMTP_ACCOUNT.get('sid')
    smtp_server = 'smtp.qq.com'
    message = MIMEText(msg, 'plain', 'utf-8')
    message['From'] = Header('ARAI.DM', 'utf-8')  # sender
    message['Subject'] = Header(f"{headers}", 'utf-8')
    server = smtplib.SMTP_SSL(smtp_server, 465)

    # Normalize the input to a set of addresses
    if to_ == 'self':
        to_ = {sender}
    if isinstance(to_, str):
        to_ = [to_]
    if isinstance(to_, list):
        to_ = set(to_)
    if not isinstance(to_, set):
        return False

    try:
        server.login(sender, password)
        for to in to_:
            try:
                # Drop any previous recipient header before reusing the message;
                # setting message['To'] repeatedly would append duplicate headers
                del message['To']
                message['To'] = Header(to, 'utf-8')  # recipient
                server.sendmail(sender, to, message.as_string())
                logger.success("Sent successfully -> {}".format(to))
            except smtplib.SMTPRecipientsRefused:
                logger.warning('Recipient address invalid or nonexistent -> {}'.format(to))
            except Exception as e:
                logger.error('>>> Send failed || {}'.format(e))
    finally:
        server.quit()
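A hedged usage sketch; SMTP_ACCOUNT is assumed to hold a QQ-mail address under 'email' and its SMTP authorization code under 'sid', and the recipient addresses are placeholders:

# Bulk notice with identical content.
send_email("scheduler restarted", to_=["ops@example.com", "admin@example.com"])
# Notice to the configured account itself, with a custom subject.
send_email("self-check ok", to_="self", headers="heartbeat")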
Example 10
    def control_driver(self, sub_info: List[str], threshold: int = 4):
        """

        :param sub_info: [subs, key_secret_class]
        :param threshold: decoupling confidence threshold; subscriptions whose node
            count is at or below this value are culled
        :return:
        """
        try:
            # Clean a designated subscription source
            if self.kill_ and self.kill_ in sub_info[0]:
                self._del_subs(sub_info[-1], sub_info[0],
                               "target active removal")
            else:
                # Parse the subscription
                node_info: dict = subs2node(sub_info[0])
                # Decouple the subscription
                if len(node_info['node']) <= threshold:
                    self._del_subs(sub_info[-1], sub_info[0],
                                   "decouple active removal")
                elif self.debug:
                    print(
                        Fore.WHITE,
                        f"[{datetime.now()}] valid -- {node_info['subs']} -- {len(node_info['node'])}"
                    )

        except (UnicodeDecodeError, TypeError) as e:
            # Bump the request counter of a subscription already flagged as "parse error"
            if self.temp_cache.get(sub_info[0]):
                self.temp_cache[sub_info[0]] += 1
            # Otherwise flag the subscription as "parse error"
            else:
                print(Fore.YELLOW,
                      f"[{datetime.now()}] recheck -- {sub_info[0]}")
                self.temp_cache[sub_info[0]] = 1
            # Fewer than 3 retries so far: requeue the link at the tail of the task queue
            if self.temp_cache[sub_info[0]] <= 3:
                self.work_q.put_nowait(sub_info)
            # More than 3 retries: cull it
            else:
                self._del_subs(sub_info[-1], sub_info[0], e)
        except SystemExit:
            warnings.warn("Disable the system proxy before deploying the subscription-cleaning task")
        except Exception as e:
            logger.warning(f"{sub_info} -- {e}")
            self._del_subs(sub_info[-1], sub_info[0], e)
Example 11
 def _sync_launch_interval() -> dict:
     # Hot-reload the configuration file
     launch_interval = LAUNCH_INTERVAL
     for task_name, task_interval in launch_interval.items():
         if not task_interval or task_interval <= 1:
             logger.critical(
                 f"<launch_interval>--{task_name} fatal setting: the interval is empty or less than 1; circuit-breaking the thread")
             raise Exception
         if not isinstance(task_interval, int):
             logger.warning(
                 f"<launch_interval>--{task_name} task interval should be an int; the parameter has been coerced")
             launch_interval.update({task_name: round(task_interval)})
         if task_interval < 60:
             logger.warning(
                 f"<launch_interval>--{task_name} task fires too often; it should run at most once per 60s, the parameter has been coerced")
             launch_interval.update({task_name: 60})
     return launch_interval
Example 12
    def collector(self,
                  silence: bool = True,
                  debug: bool = False,
                  page_num: int = 26,
                  sleep_node: int = 5):
        """
        STAFF site collector

        Use Selenium to obtain small batch samples through Google Search Engine
        (according to statistics, there are about 245 legal sites worldwide)

        The collection principle is roughly as follows:
        Use the characteristic word SEO to detect whether the target site exists `/staff` page content.

        :param silence: True starts headless (default); False shows the browser (use only when debugging)
        :param debug:
        :param page_num: number of result "pages" to collect; one page holds about 10 results, so per the note above set it to about 26
        :param sleep_node: sleep for a random duration every sleep_node pages collected; defaults to 5
        :return:
        """
        logger.info(
            f"Successfully obtained interface permissions -> {StaffCollector.__name__}"
        )

        try:
            # Instantiate the collector
            StaffCollector(
                silence=silence,
                # cache_path is the output directory for collected site links
                cache_path=self._cache_path_staff_hosts,
                chromedriver_path=CHROMEDRIVER_PATH,
                debug=debug).run(page_num=page_num, sleep_node=sleep_node)
        except CollectorSwitchError:
            logger.error(
                "<StaffCollector> Traffic interception is detected, and the system is taking a backup plan"
            )
        except IndexError:
            logger.warning(
                "<StaffCollector> An error occurred while switching the page number"
            )
        except NoSuchWindowException:
            logger.error("<StaffCollector> The Chromedriver exited abnormally")
        except Exception as e:
            logger.exception(f"<StaffCollector> {e}")
Example 13
    def control_driver(self, sub_info: List[str]):
        """

        @param sub_info: [subs,key_secret_class]
        @return:
        """
        try:
            # Decouple the designated cluster
            if self.kill_ and self.kill_ in sub_info[0]:
                self._del_subs(sub_info[-1], sub_info[0], "target")

            else:
                # Parse the subscription
                node_info: dict = subs2node(sub_info[0], False)
                # Print debug info
                if self.debug:
                    print(
                        f"check -- {node_info['subs']} -- {len(node_info['node'])}"
                    )
                # Decouple the subscription
                if len(node_info['node']) <= 4:
                    self._del_subs(sub_info[-1], sub_info[0], "decouple")

        # `except A or B` would only catch A; the tuple form catches both exceptions
        except (UnicodeDecodeError, TypeError) as e:
            logger.debug(
                f"Retry put the subscribe({sub_info}) to work queue -- {e}")

            # Retry each link 3 times, flagging links that time out
            if self.temp_cache.get(sub_info[0]):
                self.temp_cache[sub_info[0]] += 1
            else:
                self.temp_cache[sub_info[0]] = 1
            if self.temp_cache[sub_info[0]] <= 3:
                self.work_q.put_nowait(sub_info)
            else:
                self._del_subs(sub_info[-1], sub_info[0], e)

        except SystemExit:
            logger.critical("Disable the system proxy before running the subscription-cleaning operation")
        except Exception as e:
            logger.warning(f"{sub_info} -- {e}")
            self._del_subs(sub_info[-1], sub_info[0])
Example 14
    def refresh(self, key_name: str, cross_threshold: int = None) -> None:
        """
        Atomic link-pool refresh: remove every expired subscribe under key_name in one pass
        @param cross_threshold: delete subscriptions whose remaining life crosses this threshold
        @param key_name: secret_key
        @return:
        """

        docker: dict = self.db.hgetall(key_name)
        # Links fetched by the admin command
        if self.get_len(key_name) != 0:
            for subscribe, end_life in docker.items():
                if self.is_stale(end_life, cross_threshold):
                    logger.debug(f'del-({key_name})--{subscribe}')
                    self.db.hdel(key_name, subscribe)
            logger.success('<{}> UPDATE - {}({})'.format(
                self.__class__.__name__, key_name, self.get_len(key_name)))
        else:
            logger.warning('<{}> EMPTY - {}({})'.format(
                self.__class__.__name__, key_name, self.get_len(key_name)))
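refresh() assumes each key_name is a Redis hash mapping subscribe link -> expiry timestamp. A minimal sketch of that layout with redis-py (the key name and values are illustrative):

import redis

db = redis.StrictRedis(host="localhost", decode_responses=True)
key_name = "v2rayc:v2ray"  # hypothetical result of REDIS_SECRET_KEY.format("v2ray")
db.hset(key_name, "https://example.com/sub/abc", "2021-06-01 12:00:00")
for subscribe, end_life in db.hgetall(key_name).items():
    print(subscribe, end_life)          # iterate exactly as refresh() does
db.hdel(key_name, "https://example.com/sub/abc")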
Example 15
    def check_config(call_driver: bool = False):
        chromedriver_not_found_error = "<ScaffoldGuider> ForceRun || ChromedriverNotFound ||" \
                                       "chromedriver not found; configure it according to the technical documentation\n" \
                                       ">>> https://github.com/QIN2DIM/V2RayCloudSpider"

        # if not all(SMTP_ACCOUNT.values()):
        #     logger.warning('The <notification mailbox> (SMTP_ACCOUNT) is not configured correctly')
        # if not SERVERCHAN_SCKEY:
        #     logger.warning("The <ServerChan> SCKEY is not configured correctly")
        if not all(
            [REDIS_SLAVER_DDT.get("host"),
             REDIS_SLAVER_DDT.get("password")]):
            logger.warning('<Redis-Slave> is not configured correctly; the resource-copy feature is unavailable, but the system still runs normally.')
        if not all([REDIS_MASTER.get("host"), REDIS_MASTER.get("password")]):
            logger.error("<Redis-Master> is not configured correctly. It is a core component of the project; configure it and restart!")
            sys.exit()

        # Raised when the requested interface involves driver operations
        if call_driver and not os.path.exists(CHROMEDRIVER_PATH):
            logger.error(chromedriver_not_found_error)
            sys.exit()
Example 16
    def push_info(self, user: Union[dict, List[dict]]):
        if isinstance(user, dict):
            user = [user, ]
        elif not isinstance(user, list):
            logger.warning('MySQL add_user was called with a malformed argument')
            return

        try:
            for user_ in user:
                try:
                    sql = 'INSERT INTO v2raycs (' \
                          'domain, subs, class_, end_life, res_time, passable, username, password, email, uuid) VALUES (' \
                          '%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                    val = (user_["domain"], user_["subs"], user_['class_'], user_['end_life'], user_["res_time"],
                           user_['passable'], user_['username'], user_["password"], user_['email'], user_['uuid'])
                    self.cursor.execute(sql, val)
                except KeyError as e:
                    logger.warning(f"MySQL data parsing failed; user:dict must contain the username, password and email key-value pairs {e}")
                    # return 702
                except pymysql.err.IntegrityError as e:
                    logger.warning(f'{user_["username"]} -- user already exists; use the update command to modify user info {e}')
                    # return 701
                else:
                    logger.success(f'{user_["username"]} -- user added successfully')
                    # return 700
        finally:
            self.conn.commit()
            self.conn.close()
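A usage sketch with an illustrative record; the dict must supply every column named in the INSERT, and the owning class name here is hypothetical:

user = {
    "domain": "https://example.com", "subs": "https://example.com/sub/abc",
    "class_": "v2ray", "end_life": "2021-06-01 12:00:00",
    "res_time": "2021-05-30 12:00:00", "passable": "true",
    "username": "alice", "password": "secret",
    "email": "alice@example.com", "uuid": "8f2a...",
}
SQLPorter().push_info(user)  # hypothetical owner of push_info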
Example 17
def _is_overflow(task_name: str, rc=None):
    """
    Decide whether the local cache has reached the single-machine collection limit
    @param task_name: class_
    @param rc: RedisClient Object Driver API
    @return:
        --stop: stop task sync and end this round of collection
        --offload: stop task sync and start executing the collection tasks
        --continue: keep syncing tasks
    """

    # TODO make the cache operations atomic
    cap: int = SINGLE_TASK_CAP

    # Remaining capacity of the remote storage
    storage_remain: int = rc.__len__(REDIS_SECRET_KEY.format(f'{task_name}'))

    # Size of the local task cache
    cache_size: int = Middleware.poseidon.qsize()

    # Is the task queue fully loaded or already overflowing?
    if storage_remain >= cap:
        logger.warning(
            f'<TaskManager> OverFlow || task overflow <{task_name}>({storage_remain}/{cap})'
        )
        return 'stop'

    # Has the buffer queue reached the single-machine collection limit?
    # To prevent absolute overflow, a single machine may not exceed ~x% of the full load
    # x = 1 if single collector else x = 1/sum (number of processes)
    elif storage_remain + cache_size > round(cap * 0.8):
        # At or beyond the single-machine limit: put the task to sleep
        logger.info(
            f'<TaskManager> BeatPause || beat pause <{task_name}>({storage_remain + cache_size}/{cap})'
        )
        return 'offload'

    # Otherwise keep syncing tasks
    else:
        return 'continue'
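A worked pass through the three branches, assuming SINGLE_TASK_CAP = 100 (so the soft ceiling is round(100 * 0.8) = 80):

# storage_remain=100, cache_size=0   -> 100 >= 100 (hard cap)  -> 'stop'
# storage_remain=60,  cache_size=25  -> 60 + 25 = 85 > 80      -> 'offload'
# storage_remain=40,  cache_size=10  -> 40 + 10 = 50 <= 80     -> 'continue'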
Example 18
    def run(self, class_: str) -> None:
        """
        Data disaster tolerance for one class_
        @param class_: subscribe type  `ssr` or `v2ray` or `trojan` ...
        @return:
        """

        key_name = REDIS_SECRET_KEY.format(class_)
        self.refresh(key_name, cross_threshold=6)

        # Copy the data  ... -> self
        for subscribe, end_life in self.db.hgetall(key_name).items():
            self.docker.update({subscribe: end_life})
            # logger.info("{} {}".format(key_name, subscribe))

        # Migrate the mapping  acm <- ...
        try:
            self.acm.get_driver().hset(key_name, mapping=self.docker)
        except redis.exceptions.DataError:
            logger.warning(f'({class_}): the cache may have been busted or is empty; the administrator should maintain the link pool promptly!')
        except redis.exceptions.ConnectionError:
            logger.error(f"redis-slave {self.redis_virtual} may be down")
Example 19
def sync_actions(
    class_: str,
    mode_sync: str = None,
    only_sync=False,
    beat_sync=True,
):
    """

    @param class_:
    @param mode_sync: sync mode. 'upload' pushes locally generated tasks into the
        message queue; 'download' pulls subscription tasks from Redis; 'force_run'
        bypasses the regular sync and fills the local queue directly
    @param only_sync:
    @param beat_sync:
    @return:
    """
    logger.info(
        f"<TaskManager> Sync{mode_sync.title()} || syncing the <{class_}> task queue...")

    # ================================================
    # Beat pause, atomic sync
    # ================================================
    rc = RedisClient()
    _state = _is_overflow(task_name=class_, rc=rc)
    if _state == 'stop':
        return _state

    # ================================================
    # Update task information
    # ================================================
    # Publish the data of the collection tasks about to fire
    _update_entropy(rc=rc, entropy=__entropy__)
    # Let the factory read the mapping table and batch-produce collector entities
    sync_queue: list = ActionShunt(class_, silence=True,
                                   beat_sync=beat_sync).shunt()
    # Shuffle the task sequence
    random.shuffle(sync_queue)

    # ================================================
    # $ Execute the core business
    # ================================================
    if mode_sync == 'upload':
        # fixme: temporary workaround for the link-overflow problem
        if round(rc.get_len(REDIS_SECRET_KEY.format(class_)) *
                 1.25) > SINGLE_TASK_CAP:
            logger.warning("<TaskManager> UploadHijack -- the pool is about to overflow; the upload was hijacked")
            return None
        # Keep instantiating collection tasks
        for _ in range(len(sync_queue)):
            rc.sync_message_queue(mode='upload', message=class_)
            # Beat-sync thread lock
            if only_sync:
                logger.warning("<TaskManager> OnlySync -- beat-sync thread lock triggered; uploading a single atomic task")
                break
        logger.success("<TaskManager> UploadTasks -- task upload finished")
    elif mode_sync == 'download':
        async_queue: list = []
        while True:
            # Fetch an atomic task
            atomic = rc.sync_message_queue(mode='download')
            # If the atom is valid, sync the data
            if atomic and atomic in CRAWLER_SEQUENCE:
                # Check the sync state
                # Overload protection: stop syncing when the local buffer nears its capacity limit
                # _state is one of continue/offload/stop
                _state = _is_overflow(task_name=atomic, rc=rc)
                if _state != 'continue':
                    return _state
                if len(async_queue) == 0:
                    async_queue = ActionShunt(atomic,
                                              silence=True,
                                              beat_sync=beat_sync).shunt()
                    random.shuffle(async_queue)
                # Push the collector entity onto Poseidon, the local message queue
                Middleware.poseidon.put_nowait(async_queue.pop())
                logger.info(
                    f'<TaskManager> offload atomic<{atomic}>({Middleware.poseidon.qsize()})'
                )
                # Beat-sync thread lock
                if only_sync:
                    logger.warning(
                        f"<TaskManager> OnlySync -- <{atomic}> triggered the beat-sync thread lock; downloading a single atomic task"
                    )
                    return 'offload'
            else:
                return 'offload'
    elif mode_sync == 'force_run':
        for slave_ in sync_queue:
            # ================================================================================================
            # TODO new feature in v5.4.r: scaffold spawn
            # 1. Previously, neither the scaffold `run` nor the `force-run` command could start a
            #    collection task while the queue was fully loaded, mainly because the lines below take a lock
            # 2. The new spawn command bypasses this module: SpawnBooster compiles the underlying
            #    code directly and starts the collector
            # ================================================================================================
            # force_run: suitable for single-machine deployment or single-step debugging
            # Overflow must stay impossible, so even in force_run mode the number of
            # executed tasks must not exceed the task capacity
            _state = _is_overflow(task_name=class_, rc=rc)
            if _state != 'continue':
                return _state

            # Push the collector entity onto Poseidon, the local message queue
            Middleware.poseidon.put_nowait(slave_)

            # Beat-sync thread lock
            if only_sync:
                logger.warning(
                    f"<TaskManager> OnlySync -- <{class_}> triggered the beat-sync thread lock; downloading a single atomic task")
                return 'stop'

        return 'offload'
Example 20
    def go(self,
           debug: bool = False,
           silence: bool = True,
           power: int = os.cpu_count(),
           use_collector: bool = True,
           use_checker: bool = True,
           identity_recaptcha: bool = False,
           use_generator: bool = False) -> tuple:
        """
        Execute the business flow in series.

        collect -> check -> extract -> generate

        :param power:
        :param debug:
        :param silence: Controls the headless startup mode of the Selenium-Google Search Engine (--headless)
            True: the program starts silently (default). It must start silently when deployed on Linux.
            False: display startup, showing the browser's operating process (use only when debugging).

        :param use_collector: Whether to use the collector
            True: start Selenium-GoogleSearchEngine indiscriminate collection of staff hosts.
            False: not enabled
            - If you use this module for the first time,
                be sure to start the collector to complete the original accumulation of staff hosts,
                otherwise the checker cannot be started.
            - Since the staff host is not data that changes frequently,
                after the initial collection,
                logic can be designed to close the collector to improve the module's operating efficiency.

        :param use_checker: Whether to use the checker
            - True: start the checker to clean the staff hosts layer by layer
                and save the output to the corresponding cache directory.
            - False: not enabled. Running the whole STAFF Checker flow is recommended,
                since it cleans, filters, tests, and updates the system collection queue;
                leave it off only when debugging the program.

        :param identity_recaptcha:

        :param use_generator: Generate the system collection queue.

        :return:
        """
        # Start the STAFF collector
        if use_collector:
            self.collector(silence=silence,
                           debug=debug,
                           page_num=26,
                           sleep_node=5)
        # Start the STAFF checker
        if use_checker:
            # Perform the basic site-classification task
            self.checker(business_name="classify_urls",
                         debug=debug,
                         power=power)
        # Identify reCAPTCHA-protected sites
        if identity_recaptcha:
            # List of site entities to examine
            urls = []
            # The STAFF checker was not initialized
            if self.sc_ is None:
                # First run of the STAFF MINING module: no system cache,
                # so the decision maker cannot be used
                if self._cache_files is None:
                    pass
                # Read the run cache of the previous STAFF checker
                else:
                    files = [
                        node for node in self._cache_files
                        if ('general' in node) or ('other' in node)
                    ]
                    for file in files:
                        with open(file, 'r', encoding="utf8") as f:
                            urls += [i for i in f.read().split('\n') if i]
            # This go() run started the STAFF checker:
            # feed its run cache to the decision maker
            else:
                urls = self.sc_.queue_staff_arch_pending
            # Run the decision maker if there are identifiable site entities, otherwise warn
            if urls:
                self.is_recaptcha(urls=set(urls), silence=True)
            else:
                logger.warning("No identifiable site instance.")
        # Start the STAFF generator
        if use_generator:
            self.generator(urls=[], silence=True)

        return self.extractor()
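A hedged invocation sketch following the collect -> check -> extract -> generate flow in the docstring; the owning class name is an assumption:

miner = StaffMiner()           # hypothetical owner of go()
result = miner.go(
    debug=False,
    silence=True,              # headless Selenium; required on Linux
    use_collector=True,        # must be on for the very first run
    use_checker=True,
    identity_recaptcha=False,
    use_generator=False,
)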
Example 21
def manage_task(
        class_: str = 'v2ray',
        speedup: bool = True,
        only_sync=False,
        startup=None,
        beat_sync=True,
        force_run=None
) -> bool:
    """
    Load tasks
    @param force_run: forced run under debug mode; can escape the queue-full check
    @param startup: create the coroutine workspace and start executing queue tasks concurrently.
    @param only_sync: beat-sync thread lock. When the local task count is greater than 0,
        push one atomic task into the Poseidon coroutine space.
    @param class_: task type; must be inside the crawler seq, e.g. ssr, v2ray or trojan.
    @param speedup: use the acceleration plugin. Defaults to coroutine-speedup.
    @param beat_sync:
    @return:
    """

    # ----------------------------------------------------
    # Parameter review and translation
    # ----------------------------------------------------

    # Check the input
    if class_ not in CRAWLER_SEQUENCE or not isinstance(class_, str):
        return False

    # Review collection permission; overriding is allowed. Passing the parameter manually
    # grants this machine collection permission, otherwise the configured permission applies
    local_work: bool = startup if startup else ENABLE_DEPLOY.get('tasks').get('collector')

    # Forced run: an explicitly passed parameter has higher priority; otherwise force_run
    # follows the deployment form (enabled by default in single-machine mode).
    # If neither the parameter nor the deployment form is defined (null), force_run stays off
    force_run = force_run if force_run else SINGLE_DEPLOYMENT

    # ----------------------------------------------------
    # Resolve the sync mode
    # ----------------------------------------------------
    # The download and upload sync modes are distinguished by whether
    # this machine has collection permission
    mode_sync = "download" if local_work else "upload"

    # `force_run` has higher priority and replaces the regular sync modes
    # with the forced collection plan
    mode_sync = "force_run" if force_run else mode_sync

    # ----------------------------------------------------
    # Sync the message (task) queue
    # ----------------------------------------------------
    # If this machine can collect, sync the tasks here for local execution;
    # if it cannot, generate tasks and add them to the message queue
    response = _sync_actions(
        class_=class_,
        only_sync=only_sync,
        beat_sync=beat_sync,
        mode_sync=mode_sync,
    )

    # ----------------------------------------------------
    # Initialize the coroutine space (execute tasks)
    # ----------------------------------------------------
    # Create the coroutine space if this machine has collector permission.
    # Entering this function from control-deploy implies this machine
    # is allowed to create a coroutine space
    if force_run:
        if response == 'offload':
            logger.info(f'<TaskManager> ForceRun || <{class_}> collection task started')
            vsu(core=PuppetCore(), docker=Middleware.poseidon).run(speedup)
        logger.success(f'<TaskManager> ForceWorkFinish || <{class_}> collection task finished')
        return True

    # if 'force_run' is False and the node has the permissions of collector
    if local_work:
        # if the task queue has work to offload
        if response == 'offload':
            logger.info(f'<TaskManager> Run || <{class_}> collection task started')
            vsu(core=PuppetCore(), docker=Middleware.poseidon).run(speedup)
        logger.success(f'<TaskManager> Finish || <{class_}> collection task finished')
        return True
    else:
        logger.warning(f"<TaskManager> Hijack<{class_}> || this node lacks collection permission")
        return False
Example 22
def _sync_actions(
    class_: str,
    mode_sync: str = None,
    only_sync=False,
    beat_sync=True,
):
    """

    @param class_:
    @param mode_sync: sync mode. 'upload' pushes locally generated tasks into the
        message queue; 'download' pulls subscription tasks from Redis; 'force_run'
        bypasses the regular sync and fills the local queue directly
    @param only_sync:
    @param beat_sync:
    @return:
    """
    logger.info(
        f"<TaskManager> Sync{mode_sync.title()} || syncing the <{class_}> task queue...")

    # TODO make the sync behavior atomic
    rc = RedisClient()

    # Beat pause
    _state = _is_overflow(task_name=class_, rc=rc)
    if _state == 'stop':
        return _state

    sync_queue: list = ActionShunt(class_, silence=True,
                                   beat_sync=beat_sync).shunt()
    random.shuffle(sync_queue)

    # Generate tasks locally and add them to the message queue
    if mode_sync == 'upload':

        # fixme: temporary workaround for the link-overflow problem
        if round(rc.__len__(REDIS_SECRET_KEY.format(class_)) *
                 1.25) > SINGLE_TASK_CAP:
            logger.warning("<TaskManager> UploadHijack -- the pool is about to overflow; the upload was hijacked")
            return None

        # Keep instantiating collection tasks
        for _ in range(len(sync_queue)):

            rc.sync_message_queue(mode='upload', message=class_)

            # Beat-sync thread lock
            if only_sync:
                logger.warning("<TaskManager> OnlySync -- beat-sync thread lock triggered; uploading a single atomic task")
                break
        logger.success("<TaskManager> UploadTasks -- task upload finished")

    # Sync tasks from the distributed message queue
    elif mode_sync == 'download':
        async_queue: list = []

        while True:

            # Fetch an atomic task
            atomic = rc.sync_message_queue(mode='download')

            # If the atom is valid, sync the data
            if atomic and atomic in CRAWLER_SEQUENCE:

                # Check the sync state
                # Overload protection: stop syncing when the local buffer nears its capacity limit
                # _state is one of continue/offload/stop
                _state = _is_overflow(task_name=atomic, rc=rc)
                if _state != 'continue':
                    return _state

                if len(async_queue) == 0:
                    async_queue = ActionShunt(atomic,
                                              silence=True,
                                              beat_sync=beat_sync).shunt()
                    random.shuffle(async_queue)

                # Push the execution statement onto Poseidon, the local message queue
                Middleware.poseidon.put_nowait(async_queue.pop())

                logger.info(
                    f'<TaskManager> offload atomic<{atomic}>({Middleware.poseidon.qsize()})'
                )

                # Beat-sync thread lock
                if only_sync:
                    logger.warning(
                        f"<TaskManager> OnlySync -- <{atomic}> triggered the beat-sync thread lock; downloading a single atomic task"
                    )
                    return 'offload'

            # Otherwise leave the sync early
            else:
                # logger.warning(f"<TaskManager> SyncFinish -- <{atomic}> nothing to sync")
                return 'offload'

    elif mode_sync == 'force_run':
        for slave_ in sync_queue:

            # force_run: suitable for single-machine deployment or single-step debugging
            # Overflow must stay impossible, so even in force_run mode the number of
            # executed tasks must not exceed the task capacity
            _state = _is_overflow(task_name=class_, rc=rc)
            if _state != 'continue':
                return _state

            # Push the execution statement onto Poseidon, the local message queue
            Middleware.poseidon.put_nowait(slave_)

            # force_run mode is still constrained by the beat-sync thread lock.
            # This serves the host's subscription replenishment: it has higher
            # priority and force-interrupts the sync regardless of available queue capacity
            if only_sync:
                logger.warning(
                    f"<TaskManager> OnlySync -- <{class_}> triggered the beat-sync thread lock; downloading a single atomic task")
                return 'stop'

        return 'offload'
Example 23
def _sync_actions(
        class_: str,
        mode_sync: str = None,
        only_sync=False,
        beat_sync=True,
):
    """

    @param class_:
    @param mode_sync: sync mode. 'upload' pushes locally generated tasks into the
        message queue; 'download' pulls exec-style tasks from Redis; 'force_run'
        bypasses the regular sync and fills the local queue directly
    @param only_sync:
    @param beat_sync:
    @return:
    """
    logger.info(f"<TaskManager> Sync{mode_sync.title()} || syncing the <{class_}> task queue...")

    # TODO make the sync behavior atomic
    rc = RedisClient()

    # Duplicate the generation queue. Use copy() here, otherwise pop() would mutate the actions list itself
    # [A-Cloud, B-Cloud, ...]
    task_list: list = actions.__all__.copy()
    random.shuffle(task_list)

    # Generate tasks locally and add them to the message queue
    if mode_sync == 'upload':

        # Temporary workaround for the link-overflow problem
        if round(rc.__len__(REDIS_SECRET_KEY.format(class_)) * 1.25) > SINGLE_TASK_CAP:
            logger.warning("<TaskManager> UploadHijack -- the pool has overflowed; the upload was hijacked")
            return None

        # Keep instantiating collection tasks
        while True:
            if len(task_list) == 0:
                logger.success("<TaskManager> EmptyList -- local tasks are empty or fully generated")
                break
            else:
                slave_ = task_list.pop()

                # Convert the task into an exec-style execution statement
                expr = f'from src.BusinessLogicLayer.cluster.slavers.actions import {slave_}\n' \
                       f'{slave_}(beat_sync={beat_sync}).run()'

                # Sync the execution statement to the message queue
                rc.sync_message_queue(mode='upload', message=expr)

                # Beat-sync thread lock
                if only_sync:
                    logger.warning("<TaskManager> OnlySync -- beat-sync thread lock triggered; uploading a single atomic task")
                    break

        logger.info(f"<TaskManager> This node's tasks ({len(actions.__all__)}) were synced to the message queue; "
                    f"the remaining work completes once the cluster receives the subscription")

    # Sync tasks from the distributed message queue
    elif mode_sync == 'download':
        while True:

            # Check the sync state
            # Overload protection: stop syncing when the local buffer nears its capacity limit
            # _state is one of continue/offload/stop
            _state = _is_overflow(task_name=class_, rc=rc)
            if _state != 'continue':
                return _state

            # Fetch an atomic task; it should already be wrapped in exec syntax
            # todo move the enqueue operation into redis to get a sane loop-exit condition
            atomic = rc.sync_message_queue(mode='download')

            # If the atom is valid, sync the data
            if atomic:
                # Push the execution statement onto Poseidon, the local message queue
                Middleware.poseidon.put_nowait(atomic)
                logger.info(f'<TaskManager> offload atomic<{class_}>')

                # Beat-sync thread lock
                if only_sync:
                    logger.warning(f"<TaskManager> OnlySync -- <{class_}> triggered the beat-sync thread lock; downloading a single atomic task")
                    return 'offload'

            # Otherwise warn and leave the sync early
            else:
                logger.warning(f"<TaskManager> SyncFinish -- <{class_}> nothing to sync")
                break

    elif mode_sync == 'force_run':
        for slave_ in task_list:

            # force_run: suitable for single-machine deployment or single-step debugging
            _state = _is_overflow(task_name=class_, rc=rc)

            # Overflow must stay impossible, so even in force_run mode the number of
            # executed tasks must not exceed the task capacity
            if _state == 'stop':
                return 'stop'

            # Convert the task into an exec-style execution statement
            expr = f'from src.BusinessLogicLayer.cluster.slavers.actions import {slave_}\n' \
                   f'{slave_}(beat_sync={beat_sync}).run()'

            # Push the execution statement onto Poseidon, the local message queue
            Middleware.poseidon.put_nowait(expr)

            # force_run mode is still constrained by the beat-sync thread lock.
            # This serves the host's subscription replenishment: it has higher
            # priority and force-interrupts the sync regardless of available queue capacity
            if only_sync:
                logger.warning(f"<TaskManager> OnlySync -- <{class_}> triggered the beat-sync thread lock; downloading a single atomic task")
                return 'stop'

        logger.success(f"<TaskManager> ForceCollect"
                       f" -- local preset tasks ({len(actions.__all__)}) were enrolled into the pending queue")
        return 'offload'
Example 24
    def startup(self, driver_command_set: List[str]):
        """
        Single-process use only
        @param driver_command_set: with no command given, the list holds a single element: the launch path
        @return:
        """
        # logger.info(f">>> {' '.join(driver_command_set)}")

        # -------------------------------
        # TODO priority 0: preprocess the command set
        # -------------------------------
        # CommandId or List[CommandId]
        driver_command: List[str] = []

        # No command given: list the scaffold overview
        if len(driver_command_set) == 1:
            print("\n".join([
                f">>> {menu[0].ljust(20, '-')}|| {menu[-1]}"
                for menu in command_set.items()
            ]))
            return True
        # A single command given: translate the command
        if len(driver_command_set) == 2:
            driver_command = [
                driver_command_set[-1].lower(),
            ]
        # A command set given: translate the command set
        elif len(driver_command_set) > 2:
            driver_command = list(
                {command.lower()
                 for command in driver_command_set[1:]})

        # Catch anything unexpected
        if not isinstance(driver_command, list):
            return True
        # -------------------------------
        # TODO priority 1: parse the runtime parameters
        # -------------------------------

        # TODO --help help menu (related features still being polished)
        # When this flag is used the system does not parse run commands
        if '--help' in driver_command:
            logger.info(">>>GuiderHelp || help menu")
            driver_command.remove("--help")
            for command_ in driver_command:
                introduction = command_set.get(command_)
                if introduction:
                    print(f"> {command_.ljust(20, '-')}|| {introduction}")
                else:
                    print(f"> command {command_} does not exist")
            return True

        # Smart collection: parse the targets
        if '--parse' in driver_command:
            driver_command.remove('--parse')
            task_list = []
            for url_ in reversed(driver_command):
                if url_.startswith("http") or url_.startswith(
                        "ssr") or url_.startswith("vmess"):
                    task_list.append(
                        gevent.spawn(self._scaffold_parse, url=url_))
            gevent.joinall(task_list)
            return True

        # Clear the system cache
        if 'clear' in driver_command:
            driver_command.remove('clear')
            self._scaffold_clear()
            return True
        # -------------------------------
        # TODO priority 2: run single-thread commands
        # -------------------------------

        # Detach the blocking deploy command first: the dispatch loop below drains
        # driver_command, so checking membership afterwards could never succeed
        use_deploy = 'deploy' in driver_command
        if use_deploy:
            driver_command.remove('deploy')

        # Coroutine task queue
        task_list = []

        # Dispatch the remaining commands
        while len(driver_command) > 0:
            _pending_command = driver_command.pop()
            try:
                task_list.append(
                    gevent.spawn(self.command2solution[_pending_command]))
            except KeyError as e:
                logger.warning(f'The scaffold has not yet authorized the command <{_pending_command}> {e}')

        # Run the commands above concurrently
        gevent.joinall(task_list)

        # -------------------------------
        # TODO priority 3: custom-parameter deployment (blocks the thread)
        # -------------------------------
        if use_deploy:
            self._scaffold_deploy()
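A sketch of how the scaffold might be driven with sys.argv-style tokens; the instantiation is an assumption:

import sys

guider = ScaffoldGuider()  # hypothetical owner of startup()
# e.g. `python main.py clear deploy` -> driver_command_set == sys.argv
guider.startup(sys.argv)
# With a single element (the launch path only), startup() prints the menu and returns.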