Code example #1
def run(collect_config: dict):
    """微信公众号文章抓取爬虫

    Args:
        collect_config (dict, optional): 采集器配置
    """
    s_nums = 0
    wechat_list = collect_config["wechat_list"]
    delta_time = collect_config.get("delta_time", 3)
    for wechat_name in wechat_list:
        SGWechatSpider.wechat_name = wechat_name
        SGWechatSpider.request_config = {
            "RETRIES": 3,
            "DELAY": delta_time,
            "TIMEOUT": 5,
        }
        t_url = f"https://weixin.sogou.com/weixin?type=1&query={wechat_name}&ie=utf8&s_from=input&_sug_=n&_sug_type_="
        SGWechatSpider.start_urls = [t_url]
        try:
            SGWechatSpider.start(middleware=ua_middleware)
            s_nums += 1
        except Exception as e:
            err_msg = f"😿 公众号->{wechat_name} 文章更新失败! 错误信息: {e}"
            LOGGER.error(err_msg)

    msg = f"🤗 微信公众号文章更新完毕({s_nums}/{len(wechat_list)})!"
    LOGGER.info(msg)
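
A minimal usage sketch, assuming only the keys actually read above ("wechat_list", "delta_time"); the account name is a placeholder:

run({
    "wechat_list": ["some_wechat_account"],  # placeholder account names
    "delta_time": 3,                         # seconds between requests
})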
Code example #2
def load_data_to_articlles(input_data: dict):
    """
    将获取的文章数据并持久化到 liuli_articles
    """
    # 抓取状态
    flag = False
    doc_source_name = input_data.get("doc_source_name")
    doc_source = input_data.get("doc_source")
    doc_name = input_data.get("doc_name")

    copy_input_data = deepcopy(input_data)
    copy_input_data["doc_ts"] = int(
        copy_input_data.get("doc_ts", int(time.time())))
    if doc_source_name and doc_source and doc_name:
        # Fetch succeeded; persist the document
        mongo_base = MongodbManager.get_mongo_base(
            mongodb_config=Config.MONGODB_CONFIG)
        coll_conn = mongo_base.get_collection(coll_name="liuli_articles")
        filter_dict = {"doc_id": copy_input_data["doc_id"]}
        update_data = {"$set": copy_input_data}
        db_res = mongodb_update_data(
            coll_conn=coll_conn,
            filter_dict=filter_dict,
            update_data=update_data,
            upsert=True,
        )
        if db_res["status"]:
            msg = f"来自 {doc_source} 的文章持久化成功! 👉 {doc_source_name}: {doc_name} "
            flag = True
        else:
            msg = f"来自 {doc_source} 的文章持久化失败! 👉 {doc_source_name} {db_res['info']}"
    else:
        msg = f"来自 {doc_source} 的文章抓取失败! 👉 {doc_source}/{doc_source_name}/{doc_name} "
    LOGGER.info(msg)
    return flag
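
A hedged sketch of calling this function; the field values are placeholders, and only the fields the function actually checks plus doc_id/doc_ts are shown (md5_encryption is the project helper used in the other examples):

demo_article = {
    "doc_name": "some_article_title",
    "doc_source": "liuli_feed",
    "doc_source_name": "some_feed_name",
    "doc_id": md5_encryption("some_article_title_some_feed_name"),
    "doc_ts": int(time.time()),
}
ok = load_data_to_articlles(demo_article)  # True when the upsert into liuli_articles succeeds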
Code example #3
File: text_utils.py Project: howie6879/2c
def ad_marker(
    cos_value: float = 0.6,
    is_force=False,
    basic_filter=None,
    **kwargs,
):
    """对订阅的文章进行广告标记

    Args:
        cos_value (str): 0.6
        basic_filter (dict): {} 查询条件
        is_force (bool): 是否强制重新判决
    """
    mongo_base = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="liuli_articles")
    if is_force:
        query = {}
    else:
        query = {"cos_model": {"$exists": False}}

    query.update(basic_filter or {})

    # Find articles that have not been tagged yet and judge them with the similarity model
    for each_data in coll.find(query):
        doc_name = each_data["doc_name"]
        doc_source_name = each_data["doc_source_name"]
        doc_content = each_data["doc_content"]
        doc_keywords = each_data.get("doc_keywords")

        if not doc_keywords:
            keyword_list = extract_keyword_list(doc_content)
            doc_keywords = " ".join(keyword_list)
            each_data["doc_keywords"] = doc_keywords

        # Score with the cosine-similarity model
        cos_model_resp = model_predict_factory(
            model_name="cos",
            model_path="",
            input_dict={
                "text": doc_name + doc_keywords,
                "cos_value": cos_value
            },
            # input_dict={"text": doc_name, "cos_value": Config.COS_VALUE},
        ).to_dict()
        each_data["cos_model"] = cos_model_resp
        if cos_model_resp["result"] == 1:
            LOGGER.info(
                f"[{doc_source_name}] {doc_name} 被识别为广告[{cos_model_resp['probability']}],链接为:{each_data['doc_link']}"
            )
        coll.update_one(
            filter={"doc_id": each_data["doc_id"]},
            update={"$set": each_data},
            upsert=True,
        )
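
Two hedged invocation sketches; the basic_filter value is a placeholder:

ad_marker()  # default: only score articles that have no "cos_model" field yet
ad_marker(cos_value=0.6, is_force=True, basic_filter={"doc_source": "liuli_feed"})  # re-score one source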
Code example #4
File: all_tasks.py Project: baboon-king/2c
def update_ads_tag(is_force=False):
    """
    对订阅的文章进行广告标记
    :param is_force: 是否强制重新判决
    :return:
    """
    mongo_base = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="2c_articles")
    if is_force:
        query = {}
    else:
        query = {"cos_model": {"$exists": False}}

    # Find articles that have not been tagged yet and judge them with the pre-built similarity model
    for each_data in coll.find(query):
        doc_name = each_data["doc_name"]
        doc_link = each_data["doc_link"]
        doc_source_name = each_data["doc_source_name"]
        doc_keywords = each_data.get("doc_keywords")

        if not doc_keywords:
            keyword_list = fetch_keyword_list(doc_link)
            doc_keywords = " ".join(keyword_list)
            each_data["doc_keywords"] = doc_keywords

        # Score with the cosine-similarity model
        cos_model_resp = model_predict_factory(
            model_name="cos",
            model_path="",
            input_dict={
                "text": doc_name + doc_keywords,
                "cos_value": Config.COS_VALUE
            },
            # input_dict={"text": doc_name, "cos_value": Config.COS_VALUE},
        ).to_dict()
        each_data["cos_model"] = cos_model_resp
        if cos_model_resp["result"] == 1:
            LOGGER.info(
                f"[{doc_source_name}] {doc_name} 被识别为广告[{cos_model_resp['probability']}],链接为:{each_data['doc_link']}"
            )
        coll.update_one(
            filter={"doc_id": each_data["doc_id"]},
            update={"$set": each_data},
            upsert=True,
        )
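
This is the older variant of the tagger above (it reads the 2c_articles collection and fetches keywords from the article link); a one-line usage sketch:

update_ads_tag(is_force=False)  # tag only articles without an existing "cos_model" field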
Code example #5
def run(collect_config: dict):
    """微信公众号文章抓取爬虫

    Args:
        collect_config (dict, optional): 采集器配置
    """
    s_nums = 0
    wechat_list = collect_config["wechat_list"]
    delta_time = collect_config.get("delta_time", 5)

    for name in wechat_list:
        time.sleep(delta_time)
        input_data = asyncio.run(playwright_main(name))
        # Persist the data; this step must run
        flag = load_data_to_articlles(input_data)
        if flag:
            s_nums += 1
    msg = f"🤗 微信公众号文章更新完毕({s_nums}/{len(wechat_list)})"
    LOGGER.info(msg)
Code example #6
def run(collect_config: dict):
    """rss解析,rss条目持久化

    Args:
        collect_config (dict, optional): 采集器配置
    """
    feeds_dict: dict = collect_config.get("feeds_dict")
    feeds_name: list = list(feeds_dict)
    delta_time = collect_config.get("delta_time", 1)
    for name in feeds_name:
        LOGGER.info(f"rss源 {name}: {feeds_dict[name]}")
        fd = feedparser.parse(feeds_dict[name])
        for entry in fd.entries:
            LOGGER.info(entry.link)
            # Sleep between requests
            time.sleep(delta_time)
            resp_text = get_html_by_requests(
                url=entry.link, headers={"User-Agent": Config.SPIDER_UA}
            )
            _, doc_core_html = extract_core_html(resp_text)
            doc_core_html_lib = text_compress(doc_core_html)
            input_data = {
                "doc_date": entry.get("published", ""),
                "doc_image": "",
                "doc_name": entry.get("title", ""),
                "doc_ts": int(time.time()),
                "doc_link": entry.get("link", ""),
                "doc_source_meta_list": [],
                "doc_keywords": " ",
                "doc_des": entry.get("description", ""),
                "doc_core_html": doc_core_html_lib,
                "doc_type": "article",
                "doc_author": "",
                "doc_source_name": name,
                "doc_id": md5_encryption(f"{entry.get('title', '')}_{name}"),
                "doc_source": "liuli_feed",
                "doc_source_account_nick": "",
                "doc_source_account_intro": "",
                "doc_content": "",
                "doc_html": "",
            }
            load_data_to_articlles(input_data)
    msg = "🤗 liuli_feed 采集器执行完毕"
    LOGGER.info(msg)
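
A hedged configuration sketch for this collector; the feed name and URL are placeholders:

run({
    "feeds_dict": {
        "example_blog": "https://example.com/feed.xml",  # placeholder RSS feed URL
    },
    "delta_time": 1,  # seconds to sleep between entries
})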
Code example #7
File: action.py Project: howie6879/2c
def send_doc(sender_conf: dict):
    """
    对文章进行分发
    Args:
        sender_conf (dict): 分发配置
    """
    sender_list = sender_conf["sender_list"]
    query_days = sender_conf.get("query_days", 2)
    delta_time = sender_conf.get("delta_time", 3)
    link_source = sender_conf.get("link_source", "self")
    basic_filter = sender_conf.get("basic_filter", {})
    ignore_doc_source_name = sender_conf.get("ignore_doc_source_name", [])
    skip_ads = sender_conf.get("skip_ads", False)
    if sender_list:
        # Senders are configured, so distribution is enabled
        mongo_base = MongodbManager.get_mongo_base(
            mongodb_config=Config.MONGODB_CONFIG)
        coll = mongo_base.get_collection(coll_name="liuli_articles")

        # Distribute to each target in turn
        for send_type in sender_list:
            # Build the query conditions
            cur_ts = int(time.time())
            custom_filter = sender_conf.get("custom_filter",
                                            {}).get(send_type, {})
            query_days = custom_filter.get("query_days", query_days)
            delta_time = custom_filter.get("delta_time", delta_time)
            link_source = custom_filter.get("link_source", link_source)
            skip_ads = custom_filter.get("skip_ads", skip_ads)
            ignore_doc_source_name = custom_filter.get(
                "ignore_doc_source_name", ignore_doc_source_name)
            filter_dict = {
                **basic_filter,
                **{
                    # Time range; strictly speaking only needed on the first run
                    "doc_ts": {
                        "$gte": cur_ts - (query_days * 24 * 60 * 60),
                        "$lte": cur_ts,
                    },
                    # Filter out ignored document source names
                    "doc_source_name": {"$nin": ignore_doc_source_name},
                },
            }
            if skip_ads:
                filter_dict.update({
                    # At least one model tag has been applied
                    "cos_model": {"$exists": True},
                    # Verdict: non-ad
                    "cos_model.result": 1,
                })
            # Find all articles eligible for distribution
            for each_data in coll.find(filter_dict):
                # Temporarily hard-coded, for testing
                init_config = sender_conf.get(f"{send_type}_init_config", {})
                cos_model_resp = each_data.get("cos_model", {})
                doc_cus_des = ""
                if cos_model_resp and skip_ads:
                    # The model has produced a verdict
                    if cos_model_resp["result"] == 1:
                        # Ad label
                        doc_cus_des = f"👿广告[概率:{cos_model_resp['probability']}]"
                    else:
                        doc_cus_des = "🤓非广告"

                each_data["doc_cus_des"] = doc_cus_des
                each_data["doc_link"] = get_bak_doc_link(
                    link_source=link_source, doc_data=each_data)
                # Sleep between each send
                time.sleep(delta_time)
                send_factory(send_type=send_type,
                             init_config=init_config,
                             send_data=each_data)
    else:
        LOGGER.error("未配置分发器!")
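
A hedged sketch of the sender configuration; the sender name "wecom" and its init-config keys are assumptions for illustration, not something the code above confirms:

send_doc({
    "sender_list": ["wecom"],          # hypothetical sender name
    "query_days": 2,                   # only look at articles from the last 2 days
    "delta_time": 3,
    "link_source": "self",
    "basic_filter": {},
    "ignore_doc_source_name": [],
    "skip_ads": True,                  # only send articles that carry a cos_model tag
    "custom_filter": {"wecom": {"delta_time": 5}},  # per-sender overrides
    "wecom_init_config": {},           # hypothetical {send_type}_init_config entry
})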
Code example #8
async def playwright_main(wechat_name: str):
    """利用 playwright 获取公众号元信息,输出数据格式见上方
    Args:
        wechat_name ([str]): 公众号名称
    """
    wechat_data = {}
    try:
        async with async_playwright() as p:
            # browser = await p.chromium.launch(headless=False)
            browser = await p.chromium.launch()
            context = await browser.new_context(user_agent=Config.SPIDER_UA)
            page = await context.new_page()
            # Search for the official account
            await page.goto("https://weixin.sogou.com/")
            await page.wait_for_load_state()
            await page.click('input[name="query"]')
            await page.fill('input[name="query"]', wechat_name)
            await asyncio.sleep(1)
            await page.click("text=搜公众号")
            await page.wait_for_load_state()
            # await page.pause()
            # Grab the latest article title
            sg_html_handle = await page.query_selector("html")
            sg_html = await sg_html_handle.inner_html()
            if sg_html:
                item_list = []
                async for item in SGWechatItem.get_items(html=sg_html):
                    item_list.append(item)

                if item_list:
                    for target_item in item_list:
                        if target_item.wechat_name == wechat_name:
                            # Only continue when the name matches
                            info = f"playwright 匹配公众号 {wechat_name}({target_item.wechat_id}) 成功! 正在提取最新文章: {target_item.latest_title}"
                            LOGGER.info(info)
                            latest_href = target_item.latest_href

                            await page.goto(latest_href)
                            # Wait for the account QR-code image; once it appears, treat the page as loaded
                            try:
                                await page.wait_for_selector(
                                    selector="#js_pc_qr_code_img", timeout=6000
                                )
                            except Exception as _:
                                pass
                            await page.wait_for_load_state()
                            wx_html_handle = await page.query_selector("html")
                            wx_html = await wx_html_handle.inner_html()
                            wechat_item: WechatItem = await WechatItem.get_item(
                                html=wx_html
                            )
                            # Record the current article URL
                            wechat_item.doc_link = page.url
                            doc_source_name = wechat_item.doc_source_name or wechat_name
                            wechat_data = {
                                **wechat_item.results,
                                **{
                                    "doc_id": md5_encryption(
                                        f"{wechat_item.doc_name}_{doc_source_name}"
                                    ),
                                    "doc_source_name": doc_source_name,
                                    "doc_link": wechat_item.doc_link,
                                    "doc_source": wechat_item.doc_source,
                                    "doc_source_account_nick": wechat_item.doc_source_account_nick,
                                    "doc_source_account_intro": wechat_item.doc_source_account_intro,
                                    "doc_content": html_to_text_h2t(wx_html),
                                    "doc_keywords": "",
                                    "doc_html": "",
                                },
                            }
                            break
                    else:
                        info = f"playwright 匹配公众号 {wechat_name} 失败! "
                        LOGGER.error(info)
            else:
                info = f"playwright 抓取 HTML 失败: {wechat_name} "
                LOGGER.error(info)
            await browser.close()
    except Exception as e:
        info = f"playwright 抓取出错: {wechat_name} str{e}"
        LOGGER.error(info)
    return wechat_data
Code example #9
def backup_doc(backup_config: dict):
    """对文章进行备份

    Args:
        backup_config (dict): 备份配置
    """
    backup_list = backup_config["backup_list"]
    query_days = backup_config.get("query_days", 2)
    delta_time = backup_config.get("delta_time", 3)
    basic_filter = backup_config.get("basic_filter", {})
    doc_html_dict = backup_config.get("doc_html_dict", {})
    init_config = backup_config.get("init_config", {})
    after_get_content = backup_config.get("after_get_content", [])
    if backup_list:
        mongo_base = MongodbManager.get_mongo_base(
            mongodb_config=Config.MONGODB_CONFIG)
        coll = mongo_base.get_collection(coll_name="liuli_articles")
        cur_ts = int(time.time())
        filter_dict = {
            **basic_filter,
            **{
                # Time range; strictly speaking only needed on the first run
                "doc_ts": {
                    "$gte": cur_ts - (query_days * 24 * 60 * 60),
                    "$lte": cur_ts,
                },
            },
        }
        db_res = mongodb_find(
            coll_conn=coll,
            filter_dict=filter_dict,
            return_dict={
                "_id": 0,
                "doc_source": 1,
                "doc_source_name": 1,
                "doc_core_html": 1,
                "doc_html": 1,
                "doc_name": 1,
                "doc_link": 1,
            },
        )

        if db_res["status"]:
            # Find all articles eligible for backup
            for each_data in db_res["info"]:
                for each in backup_list:
                    # Sleep between each backup
                    time.sleep(delta_time)
                    backup_ins = backup_factory(backup_type=each,
                                                init_config=init_config)
                    # Get the document source
                    doc_source = each_data["doc_source"]
                    # Build the final HTML to store
                    doc_html = get_bak_doc_html(
                        doc_data=each_data,
                        doc_html_type=doc_html_dict.get(doc_source, "default"),
                    )
                    # Run the post-fetch hook functions
                    for func_dict in after_get_content:
                        cur_func_dict = deepcopy(func_dict)
                        func_name = cur_func_dict.pop("func")
                        LOGGER.info(
                            f"处理器(backup:after_get_content): {func_name} 正在执行..."
                        )
                        cur_func_dict.update({"text": doc_html})
                        doc_html = processor_dict[func_name](**cur_func_dict)
                    # Save the result
                    each_data["doc_html"] = doc_html
                    backup_ins.save(each_data)
        else:
            LOGGER.error(f"Backup 数据查询失败! {db_res['info']}")
    else:
        LOGGER.error("Backup 未配置备份源!")
Code example #10
def run(collect_config: dict):
    """书籍目录抓取爬虫

    Args:
        collect_config (dict, optional): 采集器配置
    """
    book_dict: dict = collect_config["book_dict"]
    delta_time = collect_config.get("delta_time", 5)
    latest_chapter_nums = collect_config.get("latest_chapter_nums", 1)
    for book_name, book_url in book_dict.items():
        resp_text = get_html_by_requests(book_url)
        all_chapters = extract_chapters(chapter_url=book_url, html=resp_text)
        latest_chapter_nums = min(latest_chapter_nums, len(all_chapters))
        latest_chapter_list = all_chapters[-latest_chapter_nums:]
        for latest_chapter in latest_chapter_list:
            doc_name = latest_chapter.get("chapter_name")
            doc_link = latest_chapter.get("chapter_url")
            # Sleep between requests
            time.sleep(delta_time)
            resp_text = get_html_by_requests(
                url=doc_link, headers={"User-Agent": Config.SPIDER_UA})
            _, doc_core_html = extract_core_html(resp_text)
            # Compress to binary for storage
            doc_core_html_lib = text_compress(doc_core_html)
            input_data = {
                "doc_date": "",
                "doc_image": "",
                "doc_name": doc_name,
                "doc_ts": int(time.time()),
                "doc_link": doc_link,
                "doc_source_meta_list": [],
                "doc_keywords": " ".join(
                    extract_keyword_list(html_to_text_h2t(resp_text))),
                "doc_des": "",
                "doc_core_html": doc_core_html_lib,
                "doc_type": "article",
                "doc_author": "",
                "doc_source_name": book_name,
                "doc_id": md5_encryption(f"{doc_name}_{book_name}"),
                "doc_source": "liuli_book",
                "doc_source_account_nick": "",
                "doc_source_account_intro": "",
                "doc_content": "",
                "doc_html": "",
            }
            # Persist the data; this step must run
            load_data_to_articlles(input_data)
    msg = "🤗 liuli_book 采集器执行完毕"
    LOGGER.info(msg)
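
A hedged configuration sketch for this collector; the book name and catalog URL are placeholders:

run({
    "book_dict": {
        "some_book_name": "https://example.com/book/",  # placeholder chapter-catalog URL
    },
    "delta_time": 5,            # seconds between chapter requests
    "latest_chapter_nums": 1,   # number of most recent chapters to fetch
})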