def send_doc():
    """Distribute recent, model-tagged articles to every configured sender.

    :return: None
    """
    mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="2c_articles")
    cur_ts = time.time()
    filter_dict = {
        # Time window (last 2 days); mostly relevant on the first run
        "doc_ts": {"$gte": cur_ts - (2 * 24 * 60 * 60), "$lte": cur_ts},
        # Must carry at least one model tag
        "cos_model": {"$exists": True},
    }
    # Every distributable article, fanned out to each delivery target
    for each_data in coll.find(filter_dict):
        # Annotate with the ad-model verdict (result == 1 marks an ad)
        cos_model_resp = each_data["cos_model"]
        if cos_model_resp["result"] == 1:
            each_data["doc_cus_des"] = f"👿广告[概率:{cos_model_resp['probability']}]"
        else:
            each_data["doc_cus_des"] = "🤓非广告"
        for send_type in Config.SENDER_LIST:
            # Sender config fixed to empty for now (testing)
            send_factory(send_type=send_type, send_config={}, send_data=each_data)
def wechat2url(name_list: list, source_type: str = "github"):
    """Map WeChat account names to their wechat-feeds RSS URLs.

    updated:
        - 21-05-11: https://github.com/hellodword/wechat-feeds dropped gitee support

    :param name_list: WeChat account names to look up
    :param source_type: feed host, "github" (default and fallback) or "gitee"
    :return: dict mapping account name -> rss url
    """
    templates = {
        "github": "https://github.com/hellodword/wechat-feeds/raw/feeds/{0}.xml",
        "gitee": "https://gitee.com/BlogZ/wechat-feeds/raw/feeds/{0}.xml",
    }
    # Any unknown source type falls back to github
    rss_tem = templates.get(source_type, templates["github"])
    mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="2c_wechat_name")
    return {
        each["name"]: rss_tem.format(each["bizid"])
        for each in coll.find({"name": {"$in": name_list}})
    }
def create_app():
    """Build and configure the Flask application (API + RSS + backup).

    url: http://flask.pocoo.org/docs/1.0/quickstart/
    :return: configured Flask instance
    """
    flask_app = Flask(__name__)
    with flask_app.app_context():
        # Internal project configuration
        app_logger = get_logger("Liuli API")
        flask_app.config["app_config"] = Config
        flask_app.config["app_logger"] = app_logger
        flask_app.config["mongodb_base"] = MongodbManager.get_mongo_base(
            mongodb_config=Config.MONGODB_CONFIG
        )
        app_logger.info(f"server({Config.API_VERSION}) started successfully :)")
        # Register the blueprints
        for blueprint in (bp_api_v1, bp_rss, bp_backup):
            flask_app.register_blueprint(blueprint)
        # Initialize JWT
        flask_app.config["JWT_SECRET_KEY"] = Config.JWT_SECRET_KEY
        _ = JWTManager(flask_app)
    return flask_app
def gen_normal_sample(nums: int = None):
    """Sample normal (non-ad) articles and print them as CSV rows.

    :param nums: number of samples; defaults to the size of the cleaned
        ads dataset so both classes stay balanced
    :return: None (rows go to stdout)
    """
    if nums is None:
        ads_path = os.path.join(Config.DS_DIR, "clean_ads.csv")
        ads_title_list = load_text_to_list(ads_path)
        # -1 presumably drops a header/blank line — TODO confirm csv layout
        nums = len(ads_title_list) - 1
    mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="2c_articles")
    # Articles already judged non-ad (result == 0) from a trusted source
    query = {"cos_model.result": 0, "doc_source_name": "真没什么逻辑"}
    # Fix: the original computed an unused `normal_path`
    # (Config.DS_DIR/normal.csv) but never wrote to it; the dead local is
    # removed. Output still goes to stdout only.
    for each_data in coll.aggregate([{"$match": query}, {"$sample": {"size": nums}}]):
        title = each_data["doc_name"]
        url = each_data["doc_link"]
        # Minimal CSV quoting: wrap the title only when it contains a comma
        title = f'"{title}"' if "," in title else title
        info = f"{title},{url},0\n"
        print(info)
def create_app():
    """Build and configure the Flask application.

    url: http://flask.pocoo.org/docs/1.0/quickstart/
    :return: configured Flask instance
    """
    flask_app = Flask(__name__)
    with flask_app.app_context():
        # Internal project configuration
        # NOTE(review): asyncio.get_event_loop() outside a running loop is
        # deprecated on modern Python — confirm target interpreter version
        app_loop = asyncio.get_event_loop()
        flask_app.config["app_config"] = Config
        flask_app.config["app_logger"] = LOGGER
        flask_app.config["app_loop"] = app_loop
        flask_app.config["mongodb_base"] = MongodbManager.get_mongo_base(
            mongodb_config=Config.MONGODB_CONFIG
        )
        # On startup, ensure the wechat-name spider ran successfully
        # (kept disabled, exactly as in the original):
        # spider = run_wechat_name_spider(loop=app_loop)
        # if spider.success_counts == 1:
        #     LOGGER.info("Wechat spider started successfully :)")
        LOGGER.info("API started successfully :)")
        flask_app.register_blueprint(bp_api)
    return flask_app
def ad_marker(
    cos_value: float = 0.6,
    is_force=False,
    basic_filter=None,
    **kwargs,
):
    """Tag subscribed articles as ads via the cosine-similarity model.

    Args:
        cos_value (float): similarity threshold, default 0.6
        is_force (bool): re-judge every article when True
        basic_filter (dict): extra mongo query conditions
    """
    mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="liuli_articles")
    # Unless forced, only judge articles never tagged before
    query = {} if is_force else {"cos_model": {"$exists": False}}
    query.update(basic_filter or {})
    for article in coll.find(query):
        doc_name = article["doc_name"]
        doc_keywords = article.get("doc_keywords")
        if not doc_keywords:
            # Lazily extract and persist keywords on first judgement
            doc_keywords = " ".join(extract_keyword_list(article["doc_content"]))
            article["doc_keywords"] = doc_keywords
        # Cosine-similarity judgement over title + keywords
        cos_model_resp = model_predict_factory(
            model_name="cos",
            model_path="",
            input_dict={"text": doc_name + doc_keywords, "cos_value": cos_value},
        ).to_dict()
        article["cos_model"] = cos_model_resp
        if cos_model_resp["result"] == 1:
            # result == 1 marks an ad
            LOGGER.info(
                f"[{article['doc_source_name']}] {doc_name} 被识别为广告[{cos_model_resp['probability']}],链接为:{article['doc_link']}"
            )
        coll.update_one(
            filter={"doc_id": article["doc_id"]},
            update={"$set": article},
            upsert=True,
        )
def test_mongo_doc():
    """Smoke-test the ad model against documents stored in mongo."""
    mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="liuli_wechat_datasets")
    for record in coll.find({}):
        name = record["doc_name"]
        resp = cos_pre(text=name)
        prob = resp["probability"]
        # Flag probable ads, skipping exact matches (probability == 1.0)
        if resp["result"] >= 0.5 and prob != 1.0:
            print(f"{name} 被识别为广告[{prob}],链接为:{record['doc_link']}")
def __init__(self, send_type: str, send_config: dict):
    """Set up a sender instance.

    :param send_type: type of the delivery target
    :param send_config: target-specific settings, e.g. credentials
    """
    self.send_type = send_type
    self.send_config = send_config
    # Mongo handle; 2c_send_list caches every article already delivered
    self.mongo_base = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG
    )
    self.sl_coll = self.mongo_base.get_collection(coll_name="2c_send_list")
def __init__(self, backup_type: str, init_config: dict):
    """Set up a backup instance.

    :param backup_type: type of the backup target
    :param init_config: target-specific settings, e.g. credentials
    """
    self.backup_type = backup_type
    self.init_config = init_config
    # Mongo handle; liuli_backup_list records every article already backed up
    self.mongo_base: MongodbBase = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG
    )
    self.bak_coll = self.mongo_base.get_collection(coll_name="liuli_backup_list")
def update_ads_tag(is_force=False):
    """Tag subscribed articles as ads using the similarity model.

    :param is_force: re-judge every article when True
    :return: None
    """
    mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="2c_articles")
    # Unless forced, only judge articles never tagged before
    query = {} if is_force else {"cos_model": {"$exists": False}}
    for article in coll.find(query):
        doc_name = article["doc_name"]
        doc_keywords = article.get("doc_keywords")
        if not doc_keywords:
            # Fetch and persist keywords on first judgement
            doc_keywords = " ".join(fetch_keyword_list(article["doc_link"]))
            article["doc_keywords"] = doc_keywords
        # Cosine-similarity judgement over title + keywords
        cos_model_resp = model_predict_factory(
            model_name="cos",
            model_path="",
            input_dict={"text": doc_name + doc_keywords, "cos_value": Config.COS_VALUE},
        ).to_dict()
        article["cos_model"] = cos_model_resp
        if cos_model_resp["result"] == 1:
            # result == 1 marks an ad
            LOGGER.info(
                f"[{article['doc_source_name']}] {doc_name} 被识别为广告[{cos_model_resp['probability']}],链接为:{article['doc_link']}"
            )
        coll.update_one(
            filter={"doc_id": article["doc_id"]},
            update={"$set": article},
            upsert=True,
        )
def send_doc(sender_conf: dict):
    """Distribute articles to every configured sender target.

    Args:
        sender_conf (dict): distribution settings. Keys: sender_list,
            query_days, delta_time, link_source, basic_filter,
            ignore_doc_source_name, skip_ads, plus per-sender overrides
            under custom_filter and <send_type>_init_config.
    """
    sender_list = sender_conf["sender_list"]
    if not sender_list:
        # Fix: original read LOGGER.error()("未配置分发器!") — it called the
        # return value of error() instead of passing the message.
        LOGGER.error("未配置分发器!")
        return
    # Global defaults; each may be overridden per sender via custom_filter
    query_days = sender_conf.get("query_days", 2)
    delta_time = sender_conf.get("delta_time", 3)
    link_source = sender_conf.get("link_source", "self")
    basic_filter = sender_conf.get("basic_filter", {})
    ignore_doc_source_name = sender_conf.get("ignore_doc_source_name", [])
    skip_ads = sender_conf.get("skip_ads", False)
    mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="liuli_articles")
    # Fan out to each delivery target
    for send_type in sender_list:
        cur_ts = int(time.time())
        custom_filter = sender_conf.get("custom_filter", {}).get(send_type, {})
        # Fix: resolve overrides into per-sender locals. The original
        # reassigned the shared defaults (query_days = custom_filter.get(...,
        # query_days)), so one sender's custom value leaked into every later
        # sender's default.
        cur_query_days = custom_filter.get("query_days", query_days)
        cur_delta_time = custom_filter.get("delta_time", delta_time)
        cur_link_source = custom_filter.get("link_source", link_source)
        cur_skip_ads = custom_filter.get("skip_ads", skip_ads)
        cur_ignore = custom_filter.get(
            "ignore_doc_source_name", ignore_doc_source_name
        )
        filter_dict = {
            **basic_filter,
            # Time window; mostly relevant on the first run
            "doc_ts": {
                "$gte": cur_ts - (cur_query_days * 24 * 60 * 60),
                "$lte": cur_ts,
            },
            # Drop ignored document sources
            "doc_source_name": {"$nin": cur_ignore},
        }
        if cur_skip_ads:
            filter_dict.update({
                # Must carry at least one model tag
                "cos_model": {"$exists": True},
                # Fix: keep non-ads only. result == 1 marks an ad everywhere
                # else in this codebase, so the original value of 1 kept
                # exactly the ads this flag claims to skip.
                "cos_model.result": 0,
            })
        # Every distributable article for this sender
        for each_data in coll.find(filter_dict):
            init_config = sender_conf.get(f"{send_type}_init_config", {})
            cos_model_resp = each_data.get("cos_model", {})
            doc_cus_des = ""
            if cos_model_resp and cur_skip_ads:
                # Model verdict available: annotate (kept defensively even
                # though the filter above already excludes ads)
                if cos_model_resp["result"] == 1:
                    doc_cus_des = f"👿广告[概率:{cos_model_resp['probability']}]"
                else:
                    doc_cus_des = "🤓非广告"
            each_data["doc_cus_des"] = doc_cus_des
            each_data["doc_link"] = get_bak_doc_link(
                link_source=cur_link_source, doc_data=each_data
            )
            # Throttle between deliveries
            time.sleep(cur_delta_time)
            send_factory(
                send_type=send_type, init_config=init_config, send_data=each_data
            )
def backup_doc(backup_config: dict):
    """Back up recent articles to every configured backup target.

    Args:
        backup_config (dict): backup settings (backup_list, query_days,
            delta_time, basic_filter, doc_html_dict, init_config,
            after_get_content hook list)
    """
    backup_list = backup_config["backup_list"]
    query_days = backup_config.get("query_days", 2)
    delta_time = backup_config.get("delta_time", 3)
    basic_filter = backup_config.get("basic_filter", {})
    # doc_source -> html flavour to store; unknown sources use "default"
    doc_html_dict = backup_config.get("doc_html_dict", {})
    init_config = backup_config.get("init_config", {})
    # Hook chain applied to the html after retrieval
    after_get_content = backup_config.get("after_get_content", [])
    if backup_list:
        mongo_base = MongodbManager.get_mongo_base(
            mongodb_config=Config.MONGODB_CONFIG)
        coll = mongo_base.get_collection(coll_name="liuli_articles")
        cur_ts = int(time.time())
        filter_dict = {
            **basic_filter,
            **{
                # Time window; mostly relevant on the first run
                "doc_ts": {
                    "$gte": cur_ts - (query_days * 24 * 60 * 60),
                    "$lte": cur_ts
                }
            },
        }
        db_res = mongodb_find(
            coll_conn=coll,
            filter_dict=filter_dict,
            return_dict={
                "_id": 0,
                "doc_source": 1,
                "doc_source_name": 1,
                "doc_core_html": 1,
                "doc_html": 1,
                "doc_name": 1,
                "doc_link": 1,
            },
        )
        if db_res["status"]:
            # Every article eligible for backup
            for each_data in db_res["info"]:
                for each in backup_list:
                    # Sleep between backups to throttle the targets
                    time.sleep(delta_time)
                    backup_ins = backup_factory(backup_type=each,
                                                init_config=init_config)
                    # Which crawler/source produced the document
                    doc_source = each_data["doc_source"]
                    # Resolve the final html payload to persist
                    doc_html = get_bak_doc_html(
                        doc_data=each_data,
                        doc_html_type=doc_html_dict.get(doc_source, "default"),
                    )
                    # Run the after-get-content hook chain; deepcopy so the
                    # shared hook spec is not mutated across iterations
                    for func_dict in after_get_content:
                        cur_func_dict = deepcopy(func_dict)
                        func_name = cur_func_dict.pop("func")
                        LOGGER.info(
                            f"处理器(backup:after_get_content): {func_name} 正在执行..."
                        )
                        cur_func_dict.update({"text": doc_html})
                        doc_html = processor_dict[func_name](**cur_func_dict)
                    # Persist the (possibly transformed) html
                    each_data["doc_html"] = doc_html
                    backup_ins.save(each_data)
        else:
            LOGGER.error(f"Backup 数据查询失败! {db_res['info']}")
    else:
        LOGGER.error("Backup 未配置备份源!")