def save_backup(self, doc_source: str, doc_source_name: str, doc_name: str) -> bool:
    """Persist the backup state of an article.

    Args:
        doc_source (str): Source the article was fetched from
        doc_source_name (str): Name of the article source
        doc_name (str): Article name

    Returns:
        bool: Whether the operation succeeded
    """
    file_msg = f"{doc_source}/{doc_source_name}/{doc_name}"
    op_res = True
    try:
        filter_dict = {
            "backup_type": self.backup_type,
            "doc_source": doc_source,
            "doc_source_name": doc_source_name,
            "doc_name": doc_name,
        }
        update_data = {"$set": {**filter_dict, "ts": int(time.time())}}
        # Upsert so repeated backups of the same article only refresh the timestamp
        self.bak_coll.update_one(filter=filter_dict, update=update_data, upsert=True)
        LOGGER.info(f"Backup({self.backup_type}): article {file_msg} state saved!")
    except Exception as e:
        op_res = False
        LOGGER.error(f"Backup({self.backup_type}): failed to save state of article {file_msg}! {e}")
    return op_res
def send(self, send_data) -> bool:
    """
    Deliver a document to the Bark terminal
    :param send_data: payload dict; fields are developer-defined
    :return:
    """
    doc_name = send_data["doc_name"]
    # doc_source = send_data["doc_source"]
    doc_link = send_data["doc_link"]
    doc_cus_des = send_data["doc_cus_des"]
    doc_source_name = send_data["doc_source_name"]
    doc_id = send_data["doc_id"]
    is_send = self.is_send(doc_id=doc_id)
    send_status = True
    notice_msg = f"{doc_cus_des}👉{doc_source_name}_{doc_name}:{doc_link} delivered to {self.send_type}"
    if not is_send:
        url = self.compose(send_data)
        resp = requests.post(url, timeout=5)
        # Parse the response body once; Bark answers with a JSON payload containing a "code" field
        resp_json = json.loads(resp.text)
        if resp.status_code == 200 and resp_json["code"] == 200:
            # Persist the delivery state to the database
            self.sl_coll.insert_one({
                "send_type": self.send_type,
                "doc_id": doc_id,
                "ts": int(time.time()),
            })
            # Delivery succeeded
            LOGGER.info(f"{notice_msg} succeeded!")
        else:
            LOGGER.error(f"{notice_msg} failed: {resp_json}")
            send_status = False
    return send_status
def delete_backup(
    self, doc_source: str, doc_source_name: str, doc_name: str
) -> bool:
    """Delete the backup state of an article.

    Args:
        doc_source (str): Source the article was fetched from
        doc_source_name (str): Name of the article source
        doc_name (str): Article name

    Returns:
        bool: Whether the operation succeeded
    """
    file_msg = f"{doc_source}/{doc_source_name}/{doc_name}"
    op_res = True
    try:
        self.bak_coll.delete_one(
            {
                "backup_type": self.backup_type,
                "doc_source": doc_source,
                "doc_source_name": doc_source_name,
                "doc_name": doc_name,
            }
        )
        LOGGER.info(f"Backup({self.backup_type}): article {file_msg} state deleted!")
    except Exception as e:
        op_res = False
        LOGGER.error(f"Backup({self.backup_type}): failed to delete state of article {file_msg}! {e}")
    return op_res
def create_app():
    """
    Create the web application
    url: http://flask.pocoo.org/docs/1.0/quickstart/
    :return:
    """
    flask_app = Flask(__name__)
    with flask_app.app_context():
        # Internal project configuration
        mongodb_base = MongodbManager.get_mongo_base(
            mongodb_config=Config.MONGODB_CONFIG
        )
        app_loop = asyncio.get_event_loop()
        flask_app.config["app_config"] = Config
        flask_app.config["app_logger"] = LOGGER
        flask_app.config["app_loop"] = app_loop
        flask_app.config["mongodb_base"] = mongodb_base
        # Make sure the WeChat account-name spider runs successfully on every startup
        # spider = run_wechat_name_spider(loop=app_loop)
        # if spider.success_counts == 1:
        #     # Spider ran successfully
        #     LOGGER.info("Wechat spider started successfully :)")
        LOGGER.info("API started successfully :)")

    flask_app.register_blueprint(bp_api)
    return flask_app
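# Usage sketch (not part of the original module): serving the factory above with
# Flask's built-in development server. Host/port values are illustrative assumptions.
if __name__ == "__main__":
    app = create_app()
    # The built-in server is only meant for local debugging
    app.run(host="0.0.0.0", port=8765, debug=False)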
def delete(self, doc_source: str, doc_source_name: str, doc_name: str) -> bool:
    """Delete a backed-up file.

    Args:
        doc_source (str): Source the article was fetched from
        doc_source_name (str): Name of the article source
        doc_name (str): Article name

    Returns:
        bool: Whether the operation succeeded
    """
    file_path = f"{doc_source}/{doc_source_name}/{doc_name}.html"
    db_res = mongodb_delete_many_data(
        coll_conn=self.liuli_backup_coll,
        filter_dict={
            "doc_source": doc_source,
            "doc_source_name": doc_source_name,
            "doc_name": doc_name,
        },
    )
    op_res = True
    if db_res["status"]:
        LOGGER.info(f"Backup({self.backup_type}): {file_path} deleted!")
        # Remove the article's backup state as well
        self.delete_backup(
            doc_source=doc_source,
            doc_source_name=doc_source_name,
            doc_name=doc_name,
        )
    else:
        op_res = False
        LOGGER.error(
            f"Backup({self.backup_type}): failed to delete {file_path}! {db_res['info']}"
        )
    return op_res
def run_liuli_schedule(ll_config_name: str = "default"):
    """Start the scheduler.

    Args:
        ll_config_name (str): Name of the Liuli task configuration file
    """
    ll_config_path = os.path.join(Config.LL_CONFIG_DIR, f"{ll_config_name}.json")
    with open(ll_config_path, "r", encoding="utf-8") as load_f:
        ll_config = json.load(load_f)

    schedule_time_list = ll_config["schedule"].get(
        "period_list", ["00:10", "12:10", "21:10"]
    )
    for each in schedule_time_list:
        schedule.every().day.at(each).do(run_liuli_task, ll_config)

    name: str = ll_config["name"]
    author: str = ll_config["author"]
    start_info = f"Schedule({Config.SCHEDULE_VERSION}) task({name}@{author}) started successfully :)"
    LOGGER.info(start_info)
    schedule_msg = f"Task({name}@{author}) schedule time:\n " + "\n ".join(
        schedule_time_list
    )
    LOGGER.info(schedule_msg)
    # Run once immediately on startup
    run_liuli_task(ll_config)
    while True:
        schedule.run_pending()
        time.sleep(1)
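# Invocation sketch (assumption: the module can be run directly; the project's real
# entry point may wrap this differently). The default config name comes from the
# function signature above.
if __name__ == "__main__":
    run_liuli_schedule(ll_config_name="default")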
def delete(self, doc_source: str, doc_source_name: str, doc_name: str) -> bool:
    """Delete a backed-up file.

    Args:
        doc_source (str): Source the article was fetched from
        doc_source_name (str): Name of the article source
        doc_name (str): Article name

    Returns:
        bool: Whether the operation succeeded
    """
    file_path = f"{doc_source}/{doc_source_name}/{doc_name}.html"
    op_res = True
    try:
        contents = self.repo.get_contents(file_path)
        _ = self.repo.delete_file(contents.path, f"Remove {file_path}", contents.sha)
        LOGGER.info(f"Backup({self.backup_type}): {file_path} deleted!")
        # Remove the article's backup state as well
        self.delete_backup(
            doc_source=doc_source,
            doc_source_name=doc_source_name,
            doc_name=doc_name,
        )
    except Exception as e:
        op_res = False
        LOGGER.error(f"Backup({self.backup_type}): failed to delete {file_path}! {e}")
    return op_res
def save(self, backup_data: dict) -> bool:
    """Perform the backup action.

    Args:
        backup_data (dict): Data to back up

    Returns:
        bool: Whether the operation succeeded
    """
    # The following fields must be present
    doc_source = backup_data["doc_source"]
    doc_source_name = backup_data["doc_source_name"]
    doc_name = backup_data["doc_name"]
    # Original article HTML
    doc_html = backup_data["doc_html"]
    file_msg = f"{doc_source}/{doc_source_name}/{doc_name}"
    file_path = f"{file_msg}.html"
    is_backup = self.is_backup(
        doc_source=doc_source,
        doc_source_name=doc_source_name,
        doc_name=doc_name,
    )
    op_res = True
    # Continue only if the article has not been backed up yet or a forced backup is requested
    if not is_backup or self.force_backup:
        filter_dict = {
            "doc_source": doc_source,
            "doc_source_name": doc_source_name,
            "doc_name": doc_name,
        }
        update_data = {
            "$set": {
                **filter_dict,
                **{"ts": int(time.time()), "content": text_compress(doc_html)},
            }
        }
        db_update_res = mongodb_update_data(
            coll_conn=self.liuli_backup_coll,
            filter_dict=filter_dict,
            update_data=update_data,
            upsert=True,
        )
        if db_update_res["status"]:
            msg = f"Backup({self.backup_type}): {file_path} uploaded!"
            # Persist the article's backup state
            self.save_backup(
                doc_source=doc_source,
                doc_source_name=doc_source_name,
                doc_name=doc_name,
            )
        else:
            op_res = False
            msg = f"Backup({self.backup_type}): failed to upload {file_path}! {db_update_res['info']}"
    else:
        msg = f"Backup({self.backup_type}): {file_path} already exists!"
    LOGGER.info(msg)
    return op_res
def send_post_request(url, data: dict = None, **kwargs) -> dict:
    """
    Make a POST request
    :param url: target URL
    :param data: request payload
    :param kwargs: extra arguments forwarded to requests.post
    :return: JSON-decoded response, or an empty dict on failure
    """
    try:
        resp_dict = requests.post(url, data=json.dumps(data), **kwargs).json()
    except Exception as e:
        resp_dict = {}
        LOGGER.error(f"Request failed: {e}")
    return resp_dict
def send_factory(send_type: str, send_config: dict, send_data: dict) -> bool:
    """
    Sender factory function
    :param send_type: type of the target terminal
    :param send_config: target terminal configuration
    :param send_data: payload dict; fields are developer-defined
    :return:
    """
    send_status = False
    try:
        send_module = import_module(f"src.sender.{send_type}_sender")
        send_status = send_module.send(send_config, send_data)
    except ModuleNotFoundError:
        LOGGER.error(f"Target terminal type does not exist {send_type} - {send_config} - {send_data}")
    return send_status
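# Dispatch sketch (illustrative, not from the original source): send_type "dingding"
# resolves to the module src.sender.dingding_sender, whose send(send_config, send_data)
# does the work. Config and data fields are terminal-specific, so the call is shown
# commented rather than executed:
#
#   send_factory(send_type="dingding", send_config={...}, send_data={...})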
def collect_factory(collect_type: str, collect_config: dict) -> bool:
    """
    Collector factory function
    :param collect_type: collector type
    :param collect_config: collector configuration
    :return:
    """
    collect_status = False
    try:
        collect_module = import_module(f"src.collector.{collect_type}")
        collect_status = collect_module.run(collect_config)
    except ModuleNotFoundError as e:
        LOGGER.error(f"Collector type does not exist {collect_type} - {collect_config} - {e}")
    except Exception as e:
        LOGGER.error(f"Collector execution failed {collect_type} - {collect_config} - {e}")
    return collect_status
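# Contract sketch (illustrative): a collector is any module importable as
# src.collector.<collect_type> that exposes a module-level run(collect_config) -> bool,
# for example:
#
#   # src/collector/my_collector.py  (hypothetical name)
#   def run(collect_config: dict) -> bool:
#       # fetch documents according to collect_config and persist them
#       return True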
def send_get_request(url: str, params: dict = None, timeout: int = 3, **kwargs):
    """Make a GET request.

    Args:
        url (str): Target URL
        params (dict, optional): Request parameters. Defaults to None.
        timeout (int, optional): Timeout in seconds. Defaults to 3.

    Returns:
        requests.Response: The response object, or None if the request failed
    """
    try:
        resp = requests.get(url, params, timeout=timeout, **kwargs)
    except Exception as e:
        resp = None
        LOGGER.exception(f"Request failed - {url} - {str(e)}")
    return resp
def get_html_by_requests(url: str, params: dict = None, timeout: int = 3, **kwargs):
    """Make a GET request and return the decoded page text.

    Args:
        url (str): Target page
        params (dict, optional): Request parameters. Defaults to None.
        timeout (int, optional): Timeout in seconds. Defaults to 3.

    Returns:
        str: The decoded HTML text, or None if the request or decoding failed
    """
    resp = send_get_request(url=url, params=params, timeout=timeout, **kwargs)
    text = None
    try:
        content = resp.content
        # Detect the charset with cchardet before decoding
        charset = cchardet.detect(content)
        text = content.decode(charset["encoding"])
    except Exception as e:
        LOGGER.exception(f"Failed to extract response content - {url} - {str(e)}")
    return text
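# Quick manual check of the helper above (illustrative URL, not part of the original module).
if __name__ == "__main__":
    html = get_html_by_requests("https://example.com", timeout=5)
    print(html[:200] if html else "fetch failed")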
def send_text_card(self, send_data):
    """
    Send a text-card message
    :param send_data:
    :return:
    """
    doc_name = send_data["doc_name"]
    doc_source = send_data["doc_source"]
    doc_link = send_data["doc_link"]
    doc_content = send_data["doc_content"]
    doc_cus_des = send_data["doc_cus_des"]
    doc_source_name = send_data["doc_source_name"]
    doc_keywords = send_data["doc_keywords"]
    doc_ts = send_data["doc_ts"]
    doc_date = time.strftime("%Y-%m-%d", time.localtime(doc_ts))
    doc_des_info = f"Hi, {doc_source_name} from source {doc_source} has new updates!\n\n{doc_content}\n\nArticle keywords: {doc_keywords}"
    doc_des = f'<div class="black">{doc_date} | {doc_cus_des}</div>\n<div class="normal">{doc_des_info}</div>\nFrom [2c]👉technical support❤️'
    data = {
        "toparty": 1,
        "msgtype": "textcard",
        "agentid": self.wecom_agent_id,
        "textcard": {
            "title": f"[{doc_source_name}]{doc_name}",
            "description": doc_des,
            "url": doc_link,
            "btntxt": "More",
        },
        "safe": 0,
    }
    data = json.dumps(data, ensure_ascii=False)
    try:
        # encode("utf-8").decode("latin1") makes requests transmit the raw UTF-8 bytes
        resp_dict = requests.post(
            url=self.url,
            data=data.encode("utf-8").decode("latin1"),
            headers={"Content-Type": "application/json"},
        ).json()
        return resp_dict
    except Exception as e:
        resp_dict = {}
        LOGGER.error(f"Request failed: {e}")
        return resp_dict
def backup_factory(backup_type: str, init_config: dict) -> BackupBase:
    """
    Backup factory function
    :param backup_type: backup type
    :param init_config: backup configuration
    :return:
    """
    backup_ins = None
    try:
        backup_class_name = f"{backup_type}_backup"
        backup_module = import_module(f"src.backup.{backup_class_name}")
        # Instantiate the backup class
        backup_ins = getattr(backup_module, string_camelcase(backup_class_name))(
            init_config=init_config
        )
    except ModuleNotFoundError as e:
        LOGGER.error(f"Target backup type does not exist {backup_type} - {init_config} - {e}")
    return backup_ins
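# Naming-convention sketch (illustrative): backup_type "github" is assumed to map to
# src/backup/github_backup.py exposing class GithubBackup (the camel-case name comes
# from string_camelcase). init_config fields depend on the backend, so the call is
# shown commented:
#
#   backup_ins = backup_factory(backup_type="github", init_config={...})
#   if backup_ins:
#       backup_ins.save(backup_data)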
def send_post_request(url: str, data: dict = None, timeout: int = 5, **kwargs) -> dict:
    """Make a POST request.

    Args:
        url (str): Target URL
        data (dict, optional): Request payload. Defaults to None.
        timeout (int, optional): Timeout in seconds. Defaults to 5.

    Returns:
        dict: The JSON-decoded response, or an empty dict if the request failed
    """
    try:
        resp_dict = requests.post(
            url, data=json.dumps(data), timeout=timeout, **kwargs
        ).json()
    except Exception as e:
        resp_dict = {}
        LOGGER.error(f"Request failed: {e}")
    return resp_dict
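# Example call (illustrative endpoint, not part of the original module): the helper
# JSON-encodes `data` itself, so callers pass a plain dict plus any extra requests
# kwargs such as headers.
if __name__ == "__main__":
    result = send_post_request(
        url="https://httpbin.org/post",
        data={"hello": "liuli"},
        headers={"Content-Type": "application/json"},
    )
    print(result.get("json"))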
def send(self, send_data) -> bool:
    """
    Deliver a document to the WeCom (WeChat Work) terminal
    :param send_data: payload dict; fields are developer-defined
    :return:
    """
    doc_name = send_data["doc_name"]
    doc_cus_des = send_data["doc_cus_des"]
    doc_id = send_data["doc_id"]
    doc_link = send_data["doc_link"]
    doc_source_name = send_data["doc_source_name"]
    is_send = self.is_send(doc_id=doc_id)
    send_status = True
    if not is_send:
        # Start the delivery
        resp_dict = self.send_text_card(send_data=send_data)
        notice_msg = f"{doc_cus_des}👉{doc_source_name}_{doc_name}:{doc_link} delivered to {self.send_type}"
        if resp_dict:
            if resp_dict.get("errcode") == 0:
                # Persist the delivery state to the database
                self.sl_coll.insert_one({
                    "send_type": self.send_type,
                    "doc_id": doc_id,
                    "ts": int(time.time()),
                })
                # Delivery succeeded
                LOGGER.info(f"{notice_msg} succeeded!")
            else:
                LOGGER.error(f"{notice_msg} failed: {resp_dict.get('errmsg')}")
                send_status = False
        else:
            LOGGER.error(f"{notice_msg} failed!")
            send_status = False
    return send_status
def send(self, send_data) -> bool:
    """
    Deliver a document to the DingTalk terminal
    :param send_data: payload dict; fields are developer-defined
    :return:
    """
    doc_id = send_data["doc_id"]
    doc_name = send_data["doc_name"]
    doc_source = send_data["doc_source"]
    doc_link = send_data["doc_link"]
    doc_cus_des = send_data["doc_cus_des"]
    doc_source_name = send_data["doc_source_name"]
    doc_keywords = send_data["doc_keywords"]
    doc_date = send_data["doc_date"]
    is_send = self.is_send(doc_id=doc_id)
    send_status = True
    if not is_send:
        # Start the delivery
        # data = {
        #     "msgtype": "link",
        #     "link": {
        #         "text": f"[liuli]{doc_source_name}: {doc_cus_des}\nHi, source {doc_source} has an update\n",
        #         "title": doc_name,
        #         "picUrl": "",
        #         "messageUrl": doc_link,
        #     },
        # }
        data = {
            "msgtype": "markdown",
            "markdown": {
                "text": f"## [{doc_name}]({doc_link})\n\n**{doc_source_name}** | **{doc_date}** | **{doc_cus_des}** \n\n-----\n\n> Article keywords: {doc_keywords}\n\n-----\n\nMisclassified? File an [ad feedback](https://github.com/howie6879/liuli/issues/4) 👉from [liuli](https://github.com/howie6879/liuli) technical support❤️",
                "title": f"Hi, source {doc_source} has an update!👉{doc_name} ",
            },
        }
        resp_dict = send_post_request(
            url=self.url, data=data, headers={"Content-Type": "application/json"}
        )
        notice_msg = f"{doc_cus_des}👉{doc_source_name}_{doc_name}:{doc_link} delivered to {self.send_type}"
        if resp_dict:
            if resp_dict.get("errmsg") == "ok":
                # Persist the delivery state to the database
                self.sl_coll.insert_one({
                    "send_type": self.send_type,
                    "doc_id": doc_id,
                    "ts": int(time.time()),
                })
                # Delivery succeeded
                LOGGER.info(f"{notice_msg} succeeded!")
            else:
                LOGGER.error(f"{notice_msg} failed: {resp_dict.get('errmsg')}")
                send_status = False
        else:
            LOGGER.error(f"{notice_msg} failed!")
            send_status = False
    return send_status
def save(self, backup_data: dict) -> bool:
    """Perform the backup action.

    Args:
        backup_data (dict): Data to back up

    Returns:
        bool: Whether the operation succeeded
    """
    # The following fields must be present
    doc_source = backup_data["doc_source"]
    doc_source_name = backup_data["doc_source_name"]
    doc_name = backup_data["doc_name"]
    # Original article HTML
    doc_html = backup_data["doc_html"]
    file_path = f"{doc_source}/{doc_source_name}/{doc_name}.html"
    is_backup = self.is_backup(
        doc_source=doc_source,
        doc_source_name=doc_source_name,
        doc_name=doc_name,
    )
    op_res = True
    # A record in the database is taken to mean the file also exists remotely;
    # users are expected not to touch the repository by hand, otherwise the states
    # get out of sync
    if not is_backup or self.force_backup:
        # Check for an existing remote file before uploading: articles never backed up
        # are created, articles that exist remotely without a database state are re-synced
        try:
            try:
                # Update the file if it already exists
                contents = self.repo.get_contents(file_path)
                self.repo.update_file(
                    contents.path, f"Update {file_path}", doc_html, contents.sha
                )
            except Exception:
                # Create the file if it does not exist yet
                self.repo.create_file(file_path, f"Add {file_path}", doc_html)
            LOGGER.info(f"Backup({self.backup_type}): {file_path} uploaded!")
            # Persist the article's backup state
            self.save_backup(
                doc_source=doc_source,
                doc_source_name=doc_source_name,
                doc_name=doc_name,
            )
        except GithubException as e:
            op_res = False
            LOGGER.error(f"Backup({self.backup_type}): failed to upload {file_path}! {e}")
    else:
        LOGGER.info(f"Backup({self.backup_type}): {file_path} already exists!")
    return op_res
def post(self):
    from flask_restful import request

    LOGGER.info(f'Received request {request}')
    LOGGER.info(f'Request JSON data: {request.get_json()}')
    message_attrs = parse_request(SlackMessageSchema, request)
    token = self.get_token()
    slack_client = self.setup_slack_client(token)
    response = slack_client.chat_postMessage(**message_attrs)
    LOGGER.info(f'Slack client response: {response}')
    return make_response(
        jsonify(code=response.status_code, data=response.data),
        response.status_code)
def send(self, send_data) -> bool:
    """
    Deliver a document to Telegram
    :param send_data: payload dict; fields are developer-defined
    :return:
    """
    doc_id = send_data["doc_id"]
    doc_name = send_data["doc_name"]
    doc_link = send_data["doc_link"]
    doc_cus_des = send_data["doc_cus_des"]
    doc_source_name = send_data["doc_source_name"]
    is_send = self.is_send(doc_id=doc_id)
    send_status = True
    if not is_send:
        message = TG_BOT_MSG_TEMPLATE.format_map(send_data)
        data = {
            "chat_id": self.chat_id,
            "text": message,
            "parse_mode": "HTML",
            "disable_web_page_preview": "yes",
        }
        resp_dict = send_post_request(
            url=self.url,
            data=data,
            headers={"Content-Type": "application/json"},
            timeout=5,
        )
        notice_msg = f"{doc_cus_des}👉{doc_source_name}_{doc_name}:{doc_link} delivered to {self.send_type}"
        if resp_dict:
            if resp_dict.get("ok") is True:
                # Persist the delivery state to the database
                self.sl_coll.insert_one({
                    "send_type": self.send_type,
                    "doc_id": doc_id,
                    "ts": int(time.time()),
                })
                # Delivery succeeded
                LOGGER.info(f"{notice_msg} succeeded!")
            else:
                # The Telegram Bot API reports errors in the "description" field
                LOGGER.error(f"{notice_msg} failed: {resp_dict.get('description')}")
                send_status = False
        else:
            LOGGER.error(f"{notice_msg} failed!")
            send_status = False
    return send_status
def run_liuli_task(ll_config: dict):
    """Run a scheduled task.

    Args:
        ll_config (dict): Liuli task configuration
    """
    try:
        # Work on a copy so inner functions cannot mutate the original configuration
        ll_config_data = deepcopy(ll_config)
        # Article source, used as the basic query filter
        doc_source: str = ll_config_data["doc_source"]
        basic_filter = {"basic_filter": {"doc_source": doc_source}}
        # Collector configuration
        collector_conf: dict = ll_config_data["collector"]
        # Processor configuration
        processor_conf: dict = ll_config_data["processor"]
        # Sender configuration
        sender_conf: dict = ll_config_data["sender"]
        sender_conf.update(basic_filter)
        # Backup configuration
        backup_conf: dict = ll_config_data["backup"]
        backup_conf.update(basic_filter)

        # Run the collectors
        LOGGER.info("Collectors started!")
        for collect_type, collect_config in collector_conf.items():
            collect_factory(collect_type, collect_config)
        LOGGER.info("Collectors finished!")
        # Run the processors
        LOGGER.info("Processor(after_collect): started!")
        for each in processor_conf["after_collect"]:
            func_name = each.get("func")
            # Inject the query filter
            each.update(basic_filter)
            LOGGER.info(f"Processor(after_collect): {func_name} is running...")
            processor_dict[func_name](**each)
        LOGGER.info("Processor(after_collect): finished!")
        # Run the sender
        LOGGER.info("Sender started!")
        send_doc(sender_conf)
        LOGGER.info("Sender finished!")
        # Run the backup
        LOGGER.info("Backup started!")
        backup_doc(backup_conf)
        LOGGER.info("Backup finished!")
    except Exception as e:
        LOGGER.error(f"Task execution failed! {e}")
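# Illustrative shape of the ll_config consumed above (and loaded from JSON by
# run_liuli_schedule). Only keys that the two functions actually read are shown;
# the empty sub-dicts and the sample values are placeholders, not real settings.
EXAMPLE_LL_CONFIG = {
    "name": "default",
    "author": "liuli_team",
    "doc_source": "example_source",
    "schedule": {"period_list": ["00:10", "12:10", "21:10"]},
    "collector": {},                     # collect_type -> collect_config
    "processor": {"after_collect": []},  # list of {"func": ...} steps
    "sender": {},                        # sender settings
    "backup": {},                        # backup settings
}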
import time

import schedule

from src.utils import LOGGER


def schedule_task():
    """
    Refresh the persisted latest articles of the subscribed WeChat accounts
    :return:
    """
    # Fetch the latest articles and persist them to the database
    update_wechat_doc()
    # Update the advertisement tags
    update_ads_tag()
    # Distribute the articles
    send_doc()


if __name__ == "__main__":
    # Only needs to run once, on first startup
    run_wechat_name_spider()
    # Fetch the latest WeChat articles and refresh ad tags every day
    schedule.every().day.at("07:10").do(schedule_task)
    schedule.every().day.at("11:10").do(schedule_task)
    schedule.every().day.at("16:10").do(schedule_task)
    schedule.every().day.at("20:10").do(schedule_task)
    schedule.every().day.at("23:10").do(schedule_task)

    LOGGER.info("Schedule started successfully :)")
    LOGGER.info("Schedule time:\n 07:10 \n 11:10 \n 16:10 \n 20:10 \n 23:10")
    while True:
        schedule.run_pending()
        time.sleep(1)