def startup() -> None:
    """Boot the system's long-lived subprocesses and block until they exit.

    Depending on the configuration flags, spawns:
      * a single-process / multi-thread scheduler (``_SystemEngine.run_deploy``)
      * the Flask API server (``_SystemEngine.run_server``)

    Blocks on ``join()`` until every child terminates; on Ctrl-C / SystemExit
    every child is terminated explicitly.
    """
    process_list = []
    try:
        # Deploy the <single-process multi-thread> timed tasks.
        if ENABLE_DEPLOY['global']:
            process_list.append(
                multiprocessing.Process(target=_SystemEngine.run_deploy, name='deploymentTimingTask'))
        # Deploy the Flask API.
        if ENABLE_SERVER:
            process_list.append(
                multiprocessing.Process(target=_SystemEngine.run_server, name='deploymentFlaskAPI'))
        # Launch the child processes.
        for process_ in process_list:
            logger.success(f'<SystemProcess> Startup -- {process_.name}')
            process_.start()
        # Block until every child exits.
        for process_ in process_list:
            process_.join()
    except (TypeError, AttributeError) as e:
        # BUGFIX: these were previously swallowed silently (`pass`), hiding
        # startup misconfiguration -- at least leave a trace.
        logger.exception(e)
    except (KeyboardInterrupt, SystemExit):
        # FIXME ensure termination is safe when no inter-process traffic is in flight
        logger.debug('<SystemProcess> Received keyboard interrupt signal.')
        for process_ in process_list:
            process_.terminate()
    finally:
        logger.success('<SystemProcess> The system exits completely.')
def __init__(self):
    """Initialize the disaster-tolerance (DDT) Redis backup client.

    When no slave host is configured in ``REDIS_SLAVER_DDT``, the backup
    role is redirected onto the master itself, using ``db + 1`` of the
    master connection settings.
    """
    super(RedisDataDisasterTolerance, self).__init__()

    from src.BusinessCentralLayer.setting import REDIS_SLAVER_DDT
    if not REDIS_SLAVER_DDT.get('host'):
        logger.warning('未设置数据容灾服务器,该职能将由Master执行')
        # BUGFIX: copy the settings instead of aliasing REDIS_MASTER --
        # the previous in-place update() mutated the shared global mapping,
        # incrementing its 'db' slot on every instantiation.
        redis_virtual = dict(REDIS_MASTER)
        redis_virtual.update({'db': redis_virtual['db'] + 1})
        logger.debug("备份重定向 --> {}".format(redis_virtual))
    else:
        redis_virtual = REDIS_SLAVER_DDT

    # Container initialization.
    self.docker = {}
    try:
        self.acm = RedisClient(host=redis_virtual['host'],
                               port=redis_virtual['port'],
                               password=redis_virtual['password'])
        logger.info("DDT: Master({}) -> Slaver({})".format(
            REDIS_MASTER['host'], redis_virtual['host']))
    except redis.exceptions.ConnectionError as e:
        logger.exception(e)
    finally:
        # Keep the resolved settings for later use even if the connection failed.
        self.redis_virtual = redis_virtual
def startup() -> None:
    """Boot the system's subprocesses, notify on crash, and block until exit.

    Variant of the system bootstrap that e-mails the operator when startup
    fails with a TypeError/AttributeError.
    """
    process_list = []
    try:
        # Deploy the <single-process multi-thread> timed tasks.
        if ENABLE_DEPLOY['global']:
            process_list.append(
                multiprocessing.Process(target=_SystemEngine.run_deploy, name='deploymentTimingTask'))
        # Deploy the Flask API.
        if ENABLE_SERVER:
            process_list.append(
                multiprocessing.Process(target=_SystemEngine.run_server, name='deploymentFlaskAPI'))
        # Launch the child processes.
        for process_ in process_list:
            logger.success(f'<SystemProcess> Startup -- {process_.name}')
            process_.start()
        # Block until every child exits.
        for process_ in process_list:
            process_.join()
    except (TypeError, AttributeError) as e:
        # BUGFIX: `except TypeError or AttributeError` evaluated to
        # `except TypeError` only -- AttributeError was never caught.
        logger.exception(e)
        send_email(f"[程序异常终止]{str(e)}", to_='self')
    except KeyboardInterrupt:
        # FIXME ensure termination is safe when no inter-process traffic is in flight
        logger.debug('<SystemProcess> Received keyboard interrupt signal')
        for process_ in process_list:
            process_.terminate()
    finally:
        logger.success('<SystemProcess> End the V2RayCloudSpider')
def detach(subscribe, beat_sync=False):
    """Decouple one user's subscription links from every task pool.

    @param subscribe: subscription URL whose path token identifies the user
    @param beat_sync: True -> delete immediately; False -> mark the link as
        expired so the next DDT sweep removes it (beat synchronisation)
    @return:
    """
    from faker import Faker
    from urllib.parse import urlparse

    # Extract the user token from the subscription path.
    token = urlparse(subscribe).path
    driver = RedisClient().get_driver()

    # Walk every task type ...
    for task in CRAWLER_SEQUENCE:
        key_name = REDIS_SECRET_KEY.format(task)
        # ... and every link in that type's pool, looking for the same token.
        for link, _ in driver.hgetall(key_name).items():
            if token != urlparse(link).path:
                continue
            if beat_sync:
                # Beat-sync: remove the subscription right away.
                driver.hdel(key_name, link)
                logger.debug(f'>> Detach -> {link}')
            else:
                # Mark it expired; any node's next DDT run will delete it.
                driver.hset(key_name, link, str(Faker().past_datetime()))
            break
def sign_up(self, api: Chrome): """ 注册行为 @param api: @return: """ # 任务超时则弹出协程句柄 终止任务进行 while True: # ====================================== # 紧急制动,本次行为释放宣告失败,拉闸撤退!! # ====================================== # 若任务超时 主动抛出异常 if self._is_timeout(): raise TimeoutException # ====================================== # 填充注册数据 # ====================================== time.sleep(0.5) WebDriverWait(api, 20) \ .until(expected_conditions.presence_of_element_located((By.ID, 'name'))) \ .send_keys(self.username) api.find_element_by_id('email').send_keys(self.email) api.find_element_by_id('passwd').send_keys(self.password) api.find_element_by_id('repasswd').send_keys(self.password) time.sleep(1) # ====================================== # 依据实体抽象特征,选择相应的解决方案 # ====================================== # 滑动验证 TODO 引入STAFF API 自适应识别参数 if self.anti_slider: # 打开工具箱 response = self.utils_slider(api=api) # 执行失败刷新页面并重试N次 if not response: self.work_clock_utils = time.time() api.refresh() continue # ====================================== # 提交注册数据,完成注册任务 # ====================================== # 点击注册按键 api.find_element_by_id('register-confirm').click() # 重试N轮 等待[注册成功]界面的加载 for x in range(3): try: time.sleep(1.5) api.find_element_by_xpath("//button[contains(@class,'confirm')]").click() return True except NoSuchElementException: logger.debug(f'[{x + 1} / 3]{self.action_name}验证超时,{self.timeout_retry_time}s后重试') time.sleep(self.timeout_retry_time) continue
def run(self, api=None):
    """Execute the general registration-and-collection workflow.

    @param api: optional pre-built Chrome driver; one is created from
        ``set_spider_option()`` when omitted.
    """
    logger.debug(
        f">> RUN <{self.action_name}> --> beat_sync[{self.beat_sync}] feature[General]"
    )
    # Fetch the task options.
    api = self.set_spider_option() if api is None else api
    # Run the core business logic.
    try:
        # Elastic timer: self-destruct if the target site fails to render
        # the expected scope in time, so the spider is not trapped on-site.
        self.get_html_handle(api=api, url=self.register_url, wait_seconds=45)
        # Register the account.
        self.sign_up(api)
        # Enter the site and wait for the core element to finish rendering.
        self.wait(api, 40, "//div[@class='card-body']")
        # Capture subscriptions in atomic-type priority order.
        self.capture_subscribe(api)
    except TimeoutException:
        logger.error(
            f'>>> TimeoutException <{self.action_name}> -- {self.register_url}'
        )
    except WebDriverException as e:
        logger.error(f">>> WebDriverException <{self.action_name}> -- {e}")
    except (HTTPError, ConnectionRefusedError, ConnectionResetError) as e:
        # BUGFIX: these transient network failures were swallowed silently;
        # keep the best-effort behaviour but leave a debug trace.
        logger.debug(f">>> ConnectionError <{self.action_name}> -- {e}")
    except Exception as e:
        logger.warning(f">>> Exception <{self.action_name}> -- {e}")
    finally:
        api.quit()
def select_subs_to_admin(select_netloc: str = None, _debug=False) -> dict:
    """Report pool status, or hand the admin a subscription for a netloc.

    @param select_netloc: when None, return the status of the whole pool;
        otherwise dispatch one live subscription hosted on that netloc.
    @param _debug: when True the dispatched link is NOT detached afterwards.
    @return: dict with msg=success/failed plus status or subscription info.
    """
    # Every subscription currently in the pool, all types.
    remain_subs = []
    # Pool-status mapping: task type -> {netloc: count}.
    mapping_subs_status = {}
    # Link -> task-type mapping.
    mapping_subs_type = {}
    # Clean the data.
    for filed in CRAWLER_SEQUENCE:
        # Pull every subscription link of this type from the pool.
        filed_sbus: list = RedisClient().sync_remain_subs(
            REDIS_SECRET_KEY.format(filed))
        # Extend the aggregate queue.
        remain_subs += filed_sbus
        # Extract the netloc of each subscription.
        urls = [urlparse(i[0]).netloc for i in filed_sbus]
        # Update the mapping tables.
        mapping_subs_status.update({filed: dict(Counter(urls))})
        mapping_subs_type.update(
            zip([i[0] for i in filed_sbus], [
                filed,
            ] * len(filed_sbus)))
    # Initial state: return the pool status.
    if not select_netloc:
        return {'msg': 'success', 'info': mapping_subs_status}
    # A netloc was selected: dispatch a matching subscription.
    else:
        for tag in remain_subs:
            # Unpack the info key.
            subscribe, end_life = tag[0], tag[-1]
            # The link matches the netloc and will stay alive for at least
            # `beyond` more hours.
            if select_netloc in urlparse(subscribe).netloc and not RedisClient(
            ).is_stale(end_life, beyond=6):
                logger.debug("<SuperAdmin> -- 获取订阅")
                try:
                    return {
                        'msg': "success",
                        'debug': _debug,
                        'info': {
                            "subscribe": subscribe,
                            "endLife": end_life,
                            'subsType': mapping_subs_type[subscribe],
                            "netloc": select_netloc
                        }
                    }
                finally:
                    # Outside debug mode, detach the link asynchronously right
                    # after handing it out (finally runs despite the return).
                    if not _debug:
                        threading.Thread(target=detach,
                                         kwargs={
                                             "subscribe": subscribe,
                                             'beat_sync': True
                                         }).start()
        # Out of stock, or a malformed command.
        return {
            'msg': "failed",
            "netloc": select_netloc,
            "info": "指令错误或不存在该类型订阅",
            "status": mapping_subs_status
        }
def pop_subs_to_admin(class_: str):
    """Hand the administrator one subscription of the requested type.

    @param class_: subscription type, e.g. ``ssr`` / ``v2ray`` / ``trojan``
    @return: dict with msg=success and the subscribe, or msg=failed + info
    """
    logger.debug("<SuperAdmin> -- 获取订阅")
    from src.BusinessLogicLayer.cluster.sailor import manage_task
    try:
        # Remaining links of this subscription type.
        remain_subs: list = RedisClient().sync_remain_subs(
            REDIS_SECRET_KEY.format(class_))
        while True:
            # No link available -> report the shortage.
            if not remain_subs:
                logger.error(f'<SuperAdmin> -- 无可用<{class_}>订阅')
                return {'msg': 'failed', 'info': f"无可用<{class_}>订阅"}
            # Take the most recently added subscribe/end-life pair.
            subs, end_life = remain_subs.pop()
            # The returned link is not removed from the pool directly: it is
            # marked and pushed into the apollo buffer queue, which the DDT
            # refresh workflow flushes together with expired links.  This
            # beat-synchronised buffering keeps hot / erroneous operations
            # from hammering Redis.
            # Rough quality check for the subscription (disabled):
            # if subs2node(subs=subs, cache_path=False, timeout=2)['node'].__len__() <= 3:
            #     logger.debug(f"<check> BadLink -- {subs}")
            #     continue
            # Beat-synchronised rollback: generate/sync exactly one atomic task.
            threading.Thread(target=manage_task,
                             kwargs={
                                 "class_": class_,
                                 "only_sync": True
                             }).start()
            logger.success('管理员模式--链接分发成功')
            # Immediately decouple every subscription of the same account.
            # beat_sync=True -> refresh now; False -> deferred (beat sync).
            threading.Thread(target=detach,
                             kwargs={
                                 "subscribe": subs,
                                 'beat_sync': True
                             }).start()
            return {
                'msg': 'success',
                'subscribe': subs,
                'subsType': class_
            }
    except Exception as e:
        logger.exception(e)
        return {'msg': 'failed', 'info': str(e)}
def sign_up(self, api, retry_=0, max_retry_num_=4):
    """Register an account, retrying the whole flow on slider failure.

    @param api: Chrome driver already positioned on the registration page.
    @param retry_: current retry round of the whole sign-up flow.
    @param max_retry_num_: maximum retry rounds before giving up.
    @return: False when the retry budget is exhausted; otherwise None after
        the confirm-dialog click loop finishes.
    """
    if retry_ > max_retry_num_:
        return False
    from src.BusinessLogicLayer.plugins.defensive_counter import validation_interface

    # Fill in the registration data.
    WebDriverWait(api, 15) \
        .until(expected_conditions.presence_of_element_located((By.ID, 'name'))) \
        .send_keys(self.username)
    api.find_element_by_id('email').send_keys(self.email)
    api.find_element_by_id('passwd').send_keys(self.password)
    api.find_element_by_id('repasswd').send_keys(self.password)
    time.sleep(0.5)

    # Slider captcha.
    def spider_module(retry=0, max_retry_num=2):
        # Solve the slider; retry a couple of times on missing elements.
        if retry > max_retry_num:
            return False
        try:
            full_bg_path = join(SERVER_DIR_CACHE_BGPIC, f'fbg_{self.action_name}.png')
            bg_path = join(SERVER_DIR_CACHE_BGPIC, f'bg_{self.action_name}.png')
            response = validation_interface(api,
                                            methods='slider',
                                            full_bg_path=full_bg_path,
                                            bg_path=bg_path)
            return response
        except NoSuchElementException:
            # BUGFIX: the recursive retry's result was discarded (implicit
            # None return) -- propagate it to the caller.
            return spider_module(retry + 1, max_retry_num)

    if self.anti_slider:
        if not spider_module():
            api.refresh()
            # BUGFIX: pass the incremented retry counter; the previous call
            # restarted at retry_=0 so max_retry_num_ was never reached and
            # the recursion was unbounded.
            return self.sign_up(api, retry_ + 1, max_retry_num_)

    # Submit, then wait for the [success] confirm dialog, retrying N rounds.
    api.find_element_by_id('register-confirm').click()
    for x in range(max_retry_num_):
        try:
            time.sleep(1.5)
            api.find_element_by_xpath(
                "//button[contains(@class,'confirm')]").click()
            break
        except NoSuchElementException:
            logger.debug('{}验证超时,3s 后重试'.format(self.action_name))
            time.sleep(3)
def sign_up(self, api): """ @param api: @return: """ # ====================================== # 紧急制动,本次行为释放宣告失败,拉闸撤退!! # ====================================== if self._is_timeout(): raise TimeoutException # ====================================== # 填充注册数据 # ====================================== WebDriverWait(api, 15) \ .until(expected_conditions.presence_of_element_located((By.ID, 'name'))) \ .send_keys(self.username) api.find_element_by_id('email').send_keys(self.email) api.find_element_by_id('passwd').send_keys(self.password) api.find_element_by_id('repasswd').send_keys(self.password) time.sleep(0.5) # ====================================== # 依据实体抽象特征,选择相应的解决方案 # ====================================== # 滑动验证 if self.anti_slider: # 打开工具箱 response = self._utils_slider(api=api) # 执行失败刷新页面并重试N次 if not response: self.work_clock_utils = time.time() api.refresh() return self.sign_up(api) # ====================================== # 提交注册数据,完成注册任务 # ====================================== api.find_element_by_id('register-confirm').click() for x in range(3): try: time.sleep(1.5) api.find_element_by_xpath("//button[contains(@class,'confirm')]").click() return True except NoSuchElementException: logger.debug('{}验证超时,3s 后重试'.format(self.action_name)) time.sleep(3) continue raise TimeoutException
def killer(self):
    """Flush the apollo buffer: detach every queued (key, subscribe) pair.

    @todo batch-remove or batch-move the redis hashes
    @return:
    """
    if not self.apollo:
        return
    for entry in self.apollo:
        key_name, subscribe = entry[0], entry[-1]
        self.rc.hdel(key_name, subscribe)
        logger.debug(f'>> Detach -> {subscribe}')
def load_any_subscribe(self, api: Chrome, element_xpath_str: str, href_xpath_str: str, class_: str, retry=0):
    """Capture a subscription link and push it into the persistence pool.

    @param api: ChromeDriver object
    @param element_xpath_str: locates the tag that carries the link
    @param href_xpath_str: attribute name holding the target value, i.e. the subscribe
    @param class_: the subscribe's type, e.g. `ssr`/`v2ray`/`trojan`
    @param retry: failure-retry counter
    @todo replace the retry parameter with the `retrying` module (reconnect on
        network loss, assertion retry, action rollback, ...)
    @return:
    """
    self.subscribe = WebDriverWait(api, 30).until(expected_conditions.presence_of_element_located((
        By.XPATH,
        element_xpath_str
    ))).get_attribute(href_xpath_str)
    # If the object was captured, parse and persist the data.
    if self.subscribe:
        # Retry persistence up to 3 times on failure.
        for x in range(3):
            # ['domain', 'subs', 'class_', 'end_life', 'res_time', 'passable','username', 'password', 'email']
            try:
                # Airport domain.
                domain = urlparse(self.register_url).netloc
                # Collection timestamp.
                res_time = str(datetime.now(TIME_ZONE_CN)).split('.')[0]
                # Link usable; defaults to true.
                passable = 'true'
                # Info key.
                docker = [domain, self.subscribe, class_, self.end_life, res_time, passable,
                          self.username, self.password, self.email]
                # Persist according to the configured beat_sync mode.
                FlexibleDistribute(docker=docker, beat_sync=self.beat_sync)
                # Stop looping once the data is stored successfully.
                logger.success(">> GET <{}> -> {}:{}".format(self.action_name, class_, self.subscribe))
                # TODO ADD v5.1.0 feature: record the airport-domain -> subscribe-domain mapping cache
                # set_task2url_cache(task_name=self.__class__.__name__, register_url=self.register_url,
                #                    subs=self.subscribe)
                break
            except Exception as e:
                logger.debug(">> FAILED <{}> -> {}:{}".format(self.action_name, class_, e))
                time.sleep(1)
                continue
        # If storage never succeeded, return None.
        else:
            return None
    # Otherwise enter the robustness routine.
    # TODO diagnose the failure and build a remedy; if none is reliable,
    # make sure the task exits safely.
    else:
        if retry >= 3:
            raise TimeoutException
        retry += 1
        self.load_any_subscribe(api, element_xpath_str, href_xpath_str, class_, retry)
def run(self):
    """Verify the project file tree, rebuilding it when files are missing."""
    try:
        # Any node of the tree missing on disk means the environment is broken.
        missing = [node for node in reversed(self.root) if not os.path.exists(node)]
        if missing:
            logger.warning('系统文件残缺!')
            logger.debug("启动<工程重构>模块...")
            self.set_up_file_tree(self.root)
        self.check_config()
    finally:
        # A freshly rebuilt environment requires a restart before use.
        if self.flag:
            logger.success(">>> 运行环境链接完成,请重启项目")
            logger.warning(">>> 提醒您正确配置Chrome及对应版本的ChromeDriver")
            sys.exit()
def control_driver(self, sub_info: List[str]):
    """Inspect one subscription and decide whether to keep or detach it.

    @param sub_info: [subs, key_secret_class]
    @return:
    """
    try:
        # Decouple the targeted cluster.
        if self.kill_ and self.kill_ in sub_info[0]:
            self._del_subs(sub_info[-1], sub_info[0], "target")
        else:
            # Parse the subscription.
            node_info: dict = subs2node(sub_info[0], False)
            # Print debug info.
            if self.debug:
                print(
                    f"check -- {node_info['subs']} -- {node_info['node'].__len__()}"
                )
            # Too few nodes -> decouple the subscription.
            if node_info['node'].__len__() <= 4:
                self._del_subs(sub_info[-1], sub_info[0], "decouple")
    except (UnicodeDecodeError, TypeError) as e:
        # BUGFIX: `except UnicodeDecodeError or TypeError` evaluated to
        # `except UnicodeDecodeError` only -- TypeError was never caught.
        logger.debug(
            f"Retry put the subscribe({sub_info}) to work queue -- {e}")
        # Retry each link up to 3 times; beyond that, flag it as timed out.
        if self.temp_cache.get(sub_info[0]):
            self.temp_cache[sub_info[0]] += 1
        else:
            self.temp_cache[sub_info[0]] = 1
        if self.temp_cache[sub_info[0]] <= 3:
            self.work_q.put_nowait(sub_info)
        else:
            self._del_subs(sub_info[-1], sub_info[0], e)
    except SystemExit:
        logger.critical("请关闭系统代理后再执行订阅清洗操作")
    except Exception as e:
        logger.warning(f"{sub_info} -- {e}")
        self._del_subs(sub_info[-1], sub_info[0])
def refresh(self, key_name: str, cross_threshold: int = None) -> None:
    """Atomic pool sweep: delete every stale subscription under one key.

    @param key_name: secret hash key of the pool to sweep
    @param cross_threshold: delete subscriptions this close to expiry
        (forwarded to ``is_stale``)
    @return:
    """
    docker: dict = self.db.hgetall(key_name)
    # Links fetched via administrator commands.
    if self.get_len(key_name) == 0:
        logger.warning('<{}> EMPTY - {}({})'.format(
            self.__class__.__name__, key_name, self.get_len(key_name)))
        return
    for subscribe, end_life in docker.items():
        if not self.is_stale(end_life, cross_threshold):
            continue
        logger.debug(f'del-({key_name})--{subscribe}')
        self.db.hdel(key_name, subscribe)
    logger.success('<{}> UPDATE - {}({})'.format(
        self.__class__.__name__, key_name, self.get_len(key_name)))
def run(self, api=None):
    """Execute the shop-flow workflow: register, buy the free plan, collect.

    @param api: optional pre-built Chrome driver; one is created from
        ``set_spider_option()`` when omitted.
    @return: False when the free-plan section times out; otherwise None.
    """
    logger.debug(
        f">> RUN <{self.action_name}> --> beat_sync[{self.beat_sync}] feature[{self.atomic.get('feature')}]")
    api = self.set_spider_option() if api is None else api
    try:
        self.get_html_handle(api, self.register_url, 60)
        self.sign_up(api)
        # On the /usr page now -- try hard to let its js elements load.
        self.wait(api, 40, 'all')
        # Click through to the shop page.
        try:
            self.wait(api, 10, self.xpath_page_shop)
            api.find_element_by_xpath(self.xpath_page_shop).click()
        # A popup intercepted the click: dismiss it, then click again.
        except ElementClickInterceptedException:
            time.sleep(0.5)
            api.find_element_by_xpath("//button").click()
            time.sleep(0.5)
            # Click through to the /shop page, again.
            self.wait(api, 10, self.xpath_page_shop)
            api.find_element_by_xpath(self.xpath_page_shop).click()
        # Identify and purchase the free plan.
        try:
            self.wait(api, 3, self.xpath_button_buy)
            api.find_element_by_xpath(self.xpath_button_buy).click()
            # Back to the home page.
            time.sleep(1)
            api.get(self.register_url)
            # Grab the subscription link.
            time.sleep(1)
            self.wait(api, 40, self.xpath_canvas_subs)
            self.capture_subscribe(api)
        except TimeoutException:
            return False
    finally:
        api.quit()
def _scaffold_exile(task_sequential=4):
    """Run the complete 'exile' maintenance pipeline.

    Stages: check the task queue (entropy), clean the subscribe pool
    (decouple), clean timed-out subscribes (overdue), then dump debug data.

    @param task_sequential: number of stages, used only in log labels.
    """
    # Table-driven stage list: (log message, action to run afterwards).
    stages = (
        (f"Exile[0/{task_sequential}] || Running scaffold exile...",
         None),
        # Stage 1: check the queued tasks.
        (f"Exile[1/{task_sequential}] || Checking the task queue...",
         lambda: _ScaffoldGuider._scaffold_entropy(_debug=True)),
        # Stage 2: decouple the subscribe pool.
        (f"Exile[2/{task_sequential}] || Cleaning the subscribe pool...",
         _ScaffoldGuider._scaffold_decouple),
        # Stage 3: drop timed-out subscribes.
        (f"Exile[3/{task_sequential}] || Cleaning timed out subscribes...",
         _ScaffoldGuider._scaffold_overdue),
    )
    for message, action in stages:
        logger.debug(f"<ScaffoldGuider> {message}")
        time.sleep(0.3)
        if action is not None:
            action()
    # Finally: print the task queue and the remaining subscribes.
    logger.debug(
        f"<ScaffoldGuider> Exile[{task_sequential}/{task_sequential}] || Outputting debug data..."
    )
    _ScaffoldGuider._scaffold_entropy()
    _ScaffoldGuider._scaffold_remain()
    logger.success("<ScaffoldGuider> Exile[Mission Completed] || exile")
def _is_overflow(task_name: str, rc=None):
    """Check whether the local cache has hit the single-machine crawl cap.

    @param task_name: class_
    @param rc: RedisClient object driver API
    @return:
        --stop: stop task sync and end this collection round
        --offload: stop task sync and start collecting
        --continue: keep syncing tasks
    """
    # TODO make the cache operations atomic
    cap: int = SINGLE_TASK_CAP
    # Remaining capacity already consumed in the warehouse.
    storage_remain: int = rc.__len__(REDIS_SECRET_KEY.format(f'{task_name}'))
    # Tasks cached on this machine.
    cache_size: int = Middleware.poseidon.qsize()

    # Warehouse full (or overflowing) -> stop.
    if storage_remain >= cap:
        logger.warning(f'<TaskManager> OverFlow || 任务溢出<{task_name}>({storage_remain}/{cap})')
        return 'stop'
    # Near the single-machine limit -> pause the beat and offload.
    # Capped at ~80% of full load to avoid absolute overflow:
    # x = 1 for a lone collector, otherwise x = 1/sum(number of processes).
    if storage_remain + cache_size >= round(cap * 0.8):
        logger.debug(f'<TaskManager> BeatPause || 节拍停顿<{task_name}>({storage_remain + cache_size}/{cap})')
        return 'offload'
    # Otherwise keep syncing tasks.
    return 'continue'
def _del_subs(self, key_: str, subs: str, err_: str = '') -> None:
    """Detach a single subscription from a Redis hash.

    @param key_: hash key (pool) holding the subscription
    @param subs: the subscription link to delete
    @param err_: optional reason/tag appended to the log line
    """
    self.rc.hdel(key_, subs)
    logger.debug('>> Detach -> {} -- {}'.format(subs, err_))
def _debug_printer(self, msg: str) -> None:
    """Emit a debug-level log line, but only when verbose mode is enabled."""
    if not self._debug:
        return
    logger.debug("<ClashAdapter> | {}".format(msg))