Esempio n. 1
0
def send_doc():
    """
    Distribute articles to every configured sender.

    Reads the ``2c_articles`` collection for documents whose ``doc_ts`` lies
    within the last two days and that already carry a ``cos_model`` field, then
    passes each document to ``send_factory`` once per entry in
    ``Config.SENDER_LIST``.
    :return: None
    """
    articles = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG
    ).get_collection(coll_name="2c_articles")

    now_ts = time.time()
    window_seconds = 2 * 24 * 60 * 60
    query = {
        # Time window; per the original note this is mainly useful on the first run.
        "doc_ts": {"$gte": now_ts - window_seconds, "$lte": now_ts},
        # Only documents that have been tagged by the model.
        "cos_model": {"$exists": True},
    }

    for article in articles.find(query):
        for send_type in Config.SENDER_LIST:
            model_resp = article["cos_model"]
            # result == 1 is labelled as an advertisement, anything else as not.
            if model_resp["result"] == 1:
                description = f"👿广告[概率:{model_resp['probability']}]"
            else:
                description = "🤓非广告"
            article["doc_cus_des"] = description
            # The sender config is a fixed empty dict for now (testing placeholder).
            send_factory(send_type=send_type, send_config={}, send_data=article)
Esempio n. 2
0
def wechat2url(name_list: list, source_type: str = "github"):
    """
    Map WeChat account names to their wechat-feeds RSS URLs.

    updated:
        - 21-05-11: https://github.com/hellodword/wechat-feeds dropped gitee support
    :param name_list: account names to look up in the ``2c_wechat_name`` collection
    :param source_type: ``"gitee"`` selects the gitee template; any other value
        (including the default ``"github"``) selects the github template
    :return: dict of ``{name: rss_url}`` for every name found in the collection
    """
    github_template = "https://github.com/hellodword/wechat-feeds/raw/feeds/{0}.xml"
    gitee_template = "https://gitee.com/BlogZ/wechat-feeds/raw/feeds/{0}.xml"
    # Everything that is not explicitly "gitee" falls back to github.
    url_template = gitee_template if source_type == "gitee" else github_template

    names_coll = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG
    ).get_collection(coll_name="2c_wechat_name")

    name_to_url = {}
    for record in names_coll.find({"name": {"$in": name_list}}):
        feed_url = url_template.format(record["bizid"])
        name_to_url[record["name"]] = feed_url
    return name_to_url
Esempio n. 3
0
def create_app():
    """
    Build the Flask web application.

    Stores the project config, a logger and the MongoDB handle in
    ``app.config``, registers the API / RSS / backup blueprints and
    initialises JWT support.
    url: http://flask.pocoo.org/docs/1.0/quickstart/
    :return: the configured Flask application
    """
    app = Flask(__name__)

    with app.app_context():
        # Project-internal configuration shared through app.config.
        logger = get_logger("Liuli API")
        mongo = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
        app.config.update(
            {
                "app_config": Config,
                "app_logger": logger,
                "mongodb_base": mongo,
            }
        )

        logger.info(f"server({Config.API_VERSION}) started successfully :)")

    # Register blueprints in the same order as before.
    for blueprint in (bp_api_v1, bp_rss, bp_backup):
        app.register_blueprint(blueprint)

    # Initialise JWT.
    app.config["JWT_SECRET_KEY"] = Config.JWT_SECRET_KEY
    JWTManager(app)

    return app
Esempio n. 4
0
def gen_normal_sample(nums: int = None):
    """
    Print randomly sampled non-advertisement articles as CSV-like rows.

    Each row is ``title,url,0`` followed by a newline and is written with
    ``print``.
    :param nums: number of samples; when ``None`` it is derived from the number
        of entries loaded from ``clean_ads.csv`` minus one
        (presumably to skip a header row - TODO confirm)
    :return: None
    """
    if nums is None:
        ads_titles = load_text_to_list(os.path.join(Config.DS_DIR, "clean_ads.csv"))
        nums = len(ads_titles) - 1

    articles = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG
    ).get_collection(coll_name="2c_articles")

    # NOTE(review): this path is computed but never used below; rows are only printed.
    normal_path = os.path.join(Config.DS_DIR, "normal.csv")

    pipeline = [
        {"$match": {"cos_model.result": 0, "doc_source_name": "真没什么逻辑"}},
        {"$sample": {"size": nums}},
    ]
    for article in articles.aggregate(pipeline):
        title, url = article["doc_name"], article["doc_link"]
        # NOTE(review): the check is for a full-width comma, while the row itself
        # is joined with ASCII commas - confirm this is intended.
        if "," in title:
            title = f'"{title}"'
        print(f"{title},{url},0\n")
Esempio n. 5
0
def create_app():
    """
    Build the Flask web application.

    Stores the project config, logger, asyncio event loop and MongoDB handle
    in ``app.config`` and registers the API blueprint.
    url: http://flask.pocoo.org/docs/1.0/quickstart/
    :return: the configured Flask application
    """
    app = Flask(__name__)

    with app.app_context():
        # Project-internal configuration shared through app.config.
        mongo = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
        loop = asyncio.get_event_loop()
        app.config.update(
            app_config=Config,
            app_logger=LOGGER,
            app_loop=loop,
            mongodb_base=mongo,
        )

        # A start-up run of the WeChat-name spider used to be triggered here;
        # it is currently disabled.
        LOGGER.info("API started successfully :)")

    app.register_blueprint(bp_api)
    return app
Esempio n. 6
0
def ad_marker(
    cos_value: float = 0.6,
    is_force=False,
    basic_filter=None,
    **kwargs,
):
    """Tag subscribed articles with the cosine-similarity model verdict.

    For every matching document in ``liuli_articles`` the model response is
    stored under ``cos_model`` (and ``doc_keywords`` is filled in when missing),
    then the document is written back keyed by ``doc_id``.

    Args:
        cos_value (float): threshold passed to the model as ``cos_value``.
        is_force (bool): when true, re-evaluate every document instead of only
            those without a ``cos_model`` field.
        basic_filter (dict): extra query conditions merged into the query.
        **kwargs: accepted but not used by this function.
    """
    articles = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG
    ).get_collection(coll_name="liuli_articles")

    query = {} if is_force else {"cos_model": {"$exists": False}}
    query.update(basic_filter or {})

    # Walk the selected articles and judge each one with the similarity model.
    for article in articles.find(query):
        title = article["doc_name"]
        source_name = article["doc_source_name"]
        content = article["doc_content"]
        keywords = article.get("doc_keywords")

        if not keywords:
            # No stored keywords yet: extract them from the content and keep them.
            keywords = " ".join(extract_keyword_list(content))
            article["doc_keywords"] = keywords

        # Cosine-similarity prediction on title + keywords.
        prediction = model_predict_factory(
            model_name="cos",
            model_path="",
            input_dict={"text": title + keywords, "cos_value": cos_value},
        ).to_dict()
        article["cos_model"] = prediction

        if prediction["result"] == 1:
            LOGGER.info(
                f"[{source_name}] {title} 被识别为广告[{prediction['probability']}],链接为:{article['doc_link']}"
            )

        articles.update_one(
            filter={"doc_id": article["doc_id"]},
            update={"$set": article},
            upsert=True,
        )
Esempio n. 7
0
def test_mongo_doc():
    """
    Run the cosine model over every title in ``liuli_wechat_datasets``.

    Prints the documents whose model ``result`` is at least 0.5 and whose
    ``probability`` is not exactly 1.0.
    """
    dataset = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG
    ).get_collection(coll_name="liuli_wechat_datasets")

    for record in dataset.find({}):
        title = record["doc_name"]
        prediction = cos_pre(text=title)
        score = prediction["probability"]
        if not (prediction["result"] >= 0.5 and score != 1.0):
            continue
        print(f"{title} 被识别为广告[{score}],链接为:{record['doc_link']}")
Esempio n. 8
0
 def __init__(self, send_type: str, send_config: dict):
     """
     Store the sender settings and open the send-list collection.

     :param send_type: type of the distribution target
     :param send_config: configuration for that target, e.g. secrets / keys
     """
     self.send_type, self.send_config = send_type, send_config

     # Database handle.
     mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
     self.mongo_base = mongo_base

     # 2c_send_list records every article already sent; it acts as a cache table.
     self.sl_coll = mongo_base.get_collection(coll_name="2c_send_list")
Esempio n. 9
0
File: base.py Progetto: howie6879/2c
 def __init__(self, backup_type: str, init_config: dict):
     """
     Store the backup settings and open the backup-list collection.

     :param backup_type: type of the backup target
     :param init_config: configuration for that target, e.g. secrets / keys
     """
     self.backup_type, self.init_config = backup_type, init_config

     # Database handle.
     self.mongo_base: MongodbBase = MongodbManager.get_mongo_base(
         mongodb_config=Config.MONGODB_CONFIG
     )

     # liuli_backup_list records every article that has already been backed up.
     backup_coll = self.mongo_base.get_collection(coll_name="liuli_backup_list")
     self.bak_coll = backup_coll
Esempio n. 10
0
def update_ads_tag(is_force=False):
    """
    Tag subscribed articles with the cosine-similarity model verdict.

    For every matching document in ``2c_articles`` the model response is stored
    under ``cos_model`` (and ``doc_keywords`` is filled in when missing), then
    the document is written back keyed by ``doc_id``.
    :param is_force: when true, re-evaluate every document instead of only
        those without a ``cos_model`` field
    :return: None
    """
    articles = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG
    ).get_collection(coll_name="2c_articles")

    query = {} if is_force else {"cos_model": {"$exists": False}}

    # Walk the selected articles and judge each one with the similarity model.
    for article in articles.find(query):
        title = article["doc_name"]
        link = article["doc_link"]
        source_name = article["doc_source_name"]
        keywords = article.get("doc_keywords")

        if not keywords:
            # No stored keywords yet: fetch them via the article link and keep them.
            keywords = " ".join(fetch_keyword_list(link))
            article["doc_keywords"] = keywords

        # Cosine-similarity prediction on title + keywords.
        prediction = model_predict_factory(
            model_name="cos",
            model_path="",
            input_dict={"text": title + keywords, "cos_value": Config.COS_VALUE},
        ).to_dict()
        article["cos_model"] = prediction

        if prediction["result"] == 1:
            LOGGER.info(
                f"[{source_name}] {title} 被识别为广告[{prediction['probability']}],链接为:{article['doc_link']}"
            )

        articles.update_one(
            filter={"doc_id": article["doc_id"]},
            update={"$set": article},
            upsert=True,
        )
Esempio n. 11
0
def send_doc(sender_conf: dict):
    """
    Distribute recent articles from ``liuli_articles`` to every configured sender.

    Args:
        sender_conf (dict): distribution config. Keys read here:
            - ``sender_list`` (required): sender types to dispatch to.
            - ``query_days`` (default 2): look-back window in days.
            - ``delta_time`` (default 3): seconds to sleep before each send.
            - ``link_source`` (default ``"self"``): passed to ``get_bak_doc_link``.
            - ``basic_filter`` (default ``{}``): extra query conditions.
            - ``ignore_doc_source_name`` (default ``[]``): source names to exclude.
            - ``skip_ads`` (default ``False``): only send documents the model
              judged as non-advertisement.
            - ``custom_filter`` (default ``{}``): per-sender overrides of the
              five options above, keyed by sender type.
            - ``<send_type>_init_config`` (default ``{}``): init config handed
              to ``send_factory`` for that sender.
    """
    sender_list = sender_conf["sender_list"]
    if not sender_list:
        # Fixed: the original called ``LOGGER.error()`` with no message and then
        # called its return value, which raises TypeError instead of logging.
        LOGGER.error("未配置分发器!")
        return

    # Global defaults; per-sender overrides must never write back into these.
    default_query_days = sender_conf.get("query_days", 2)
    default_delta_time = sender_conf.get("delta_time", 3)
    default_link_source = sender_conf.get("link_source", "self")
    default_ignore_names = sender_conf.get("ignore_doc_source_name", [])
    default_skip_ads = sender_conf.get("skip_ads", False)
    basic_filter = sender_conf.get("basic_filter", {})

    mongo_base = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="liuli_articles")

    # Dispatch to each target in turn.
    for send_type in sender_list:
        cur_ts = int(time.time())
        custom_filter = sender_conf.get("custom_filter", {}).get(send_type, {})
        # Fixed: resolve the effective options into per-iteration names so that
        # one sender's override does not leak into the senders that follow.
        query_days = custom_filter.get("query_days", default_query_days)
        delta_time = custom_filter.get("delta_time", default_delta_time)
        link_source = custom_filter.get("link_source", default_link_source)
        skip_ads = custom_filter.get("skip_ads", default_skip_ads)
        ignore_doc_source_name = custom_filter.get(
            "ignore_doc_source_name", default_ignore_names)

        filter_dict = {
            **basic_filter,
            # Time window; mainly useful on the first run.
            "doc_ts": {
                "$gte": cur_ts - (query_days * 24 * 60 * 60),
                "$lte": cur_ts,
            },
            # Exclude ignored document sources.
            "doc_source_name": {
                "$nin": ignore_doc_source_name
            },
        }
        if skip_ads:
            filter_dict.update({
                # The document must carry a model verdict.
                "cos_model": {
                    "$exists": True
                },
                # Fixed: result == 1 is the advertisement verdict (see the
                # labelling below), so skipping ads means selecting result == 0.
                "cos_model.result": 0,
            })

        # Iterate over every document eligible for this sender.
        for each_data in coll.find(filter_dict):
            init_config = sender_conf.get(f"{send_type}_init_config", {})
            cos_model_resp = each_data.get("cos_model", {})
            doc_cus_des = ""
            if cos_model_resp and skip_ads:
                # The document has been judged by the model.
                if cos_model_resp["result"] == 1:
                    doc_cus_des = f"👿广告[概率:{cos_model_resp['probability']}]"
                else:
                    doc_cus_des = "🤓非广告"

            each_data["doc_cus_des"] = doc_cus_des
            each_data["doc_link"] = get_bak_doc_link(
                link_source=link_source, doc_data=each_data)
            # Sleep before every send.
            time.sleep(delta_time)
            send_factory(send_type=send_type,
                         init_config=init_config,
                         send_data=each_data)
Esempio n. 12
0
def backup_doc(backup_config: dict):
    """Back up recent articles from ``liuli_articles`` to every configured target.

    Args:
        backup_config (dict): backup config. Keys read here:
            - ``backup_list`` (required): backup target types.
            - ``query_days`` (default 2): look-back window in days.
            - ``delta_time`` (default 3): seconds to sleep before each backup.
            - ``basic_filter`` (default ``{}``): extra query conditions.
            - ``doc_html_dict`` (default ``{}``): maps ``doc_source`` to the
              html type passed to ``get_bak_doc_html`` (``"default"`` if absent).
            - ``init_config`` (default ``{}``): passed to ``backup_factory``.
            - ``after_get_content`` (default ``[]``): hook descriptors; each is a
              dict with a ``func`` key naming an entry in ``processor_dict``,
              the remaining keys being keyword arguments for it.
    """
    backup_list = backup_config["backup_list"]
    query_days = backup_config.get("query_days", 2)
    delta_time = backup_config.get("delta_time", 3)
    basic_filter = backup_config.get("basic_filter", {})
    doc_html_dict = backup_config.get("doc_html_dict", {})
    init_config = backup_config.get("init_config", {})
    after_get_content = backup_config.get("after_get_content", [])

    if not backup_list:
        LOGGER.error("Backup 未配置备份源!")
        return

    articles = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG
    ).get_collection(coll_name="liuli_articles")

    now_ts = int(time.time())
    query = {
        **basic_filter,
        # Time window; mainly useful on the first run.
        "doc_ts": {"$gte": now_ts - (query_days * 24 * 60 * 60), "$lte": now_ts},
    }
    projection = {
        "_id": 0,
        "doc_source": 1,
        "doc_source_name": 1,
        "doc_core_html": 1,
        "doc_html": 1,
        "doc_name": 1,
        "doc_link": 1,
    }
    db_res = mongodb_find(
        coll_conn=articles,
        filter_dict=query,
        return_dict=projection,
    )

    if not db_res["status"]:
        LOGGER.error(f"Backup 数据查询失败! {db_res['info']}")
        return

    # Back up every returned document to every target.
    for document in db_res["info"]:
        for backup_type in backup_list:
            # Sleep before every backup.
            time.sleep(delta_time)
            backup_ins = backup_factory(backup_type=backup_type,
                                        init_config=init_config)
            # Pick the html variant configured for this document source.
            html_type = doc_html_dict.get(document["doc_source"], "default")
            doc_html = get_bak_doc_html(doc_data=document, doc_html_type=html_type)

            # Run the after-get-content hooks, feeding each one the previous output.
            for hook in after_get_content:
                hook_kwargs = deepcopy(hook)
                func_name = hook_kwargs.pop("func")
                LOGGER.info(
                    f"处理器(backup:after_get_content): {func_name} 正在执行..."
                )
                hook_kwargs["text"] = doc_html
                doc_html = processor_dict[func_name](**hook_kwargs)

            # NOTE(review): this overwrites doc_html on the shared document, so a
            # later backup target may start from already-processed html - confirm
            # against get_bak_doc_html.
            document["doc_html"] = doc_html
            backup_ins.save(document)