Beispiel #1
0
def test_re_field_get_nothing_with_no_default():
    """Extraction with an unmatched pattern and no default must raise NothingMatchedError."""
    unmatched = RegexField(re_select='nothing to match.')
    try:
        unmatched.extract(html=HTML)
    except NothingMatchedError:
        # Expected outcome: nothing matched and no default was configured.
        return
    # Reaching this point means extract() returned normally, which is a failure.
    raise AssertionError
Beispiel #2
0
def test_re_field_in_dict_format_with_many():
    """With named groups and many=True, each match is addressable by group name."""
    pattern = '<a class="test_link" href="(?P<href>.*?)">(?P<text>.*?)</a>'
    link_field = RegexField(re_select=pattern, many=True)
    found = link_field.extract(html=HTML)
    assert len(found) == 5

    # Check both ends of the result list.
    first, last = found[0], found[4]
    assert first['href'] == 'https://github.com/howie6879/'
    assert first['text'] == 'hello1 github.'
    assert last['href'] == 'https://github.com/howie6879/'
    assert last['text'] == 'hello5 github.'
Beispiel #3
0
def test_re_field_with_many():
    """With two unnamed groups and many=True, each match unpacks into a (href, text) pair."""
    pattern = '<a class="test_link" href="(.*?)">(.*?)</a>'
    found = RegexField(re_select=pattern, many=True).extract(html=HTML)
    assert len(found) == 5

    # Each entry must unpack into exactly two captured groups.
    first_href, first_text = found[0]
    last_href, last_text = found[4]
    assert first_href == 'https://github.com/howie6879/'
    assert first_text == 'hello1 github.'
    assert last_href == 'https://github.com/howie6879/'
    assert last_text == 'hello5 github.'
Beispiel #4
0
def test_re_field_with_html_element():
    """RegexField.extract accepts the `html_etree` object and exposes named groups."""
    pattern = '<h1><a href="(?P<href>.*?)">(?P<text>.*?)</a></h1>'
    heading_link = RegexField(re_select=pattern).extract(html=html_etree)
    assert heading_link["href"] == "https://github.com"
    assert heading_link["text"] == "Github"
Beispiel #5
0
def test_re_field_get_nothing_with_no_default():
    """Extraction with an unmatched pattern and no default must raise NothingMatchedError.

    The test fails if ``extract`` returns normally; any other exception type
    propagates and fails the test as well.
    """
    field = RegexField(re_select="nothing to match.")
    try:
        field.extract(html=HTML)
    except NothingMatchedError:
        # Expected: nothing matched and no default was configured.
        pass
    else:
        # Without this branch the test would pass silently when no
        # exception is raised at all.
        raise AssertionError(
            "RegexField.extract() did not raise NothingMatchedError")
Beispiel #6
0
def test_re_field_with_default():
    """When the pattern matches nothing, extract() returns the configured default."""
    fallback = "default value"
    extracted = RegexField(
        re_select="nothing to match.",
        default=fallback,
    ).extract(html=HTML)
    assert extracted == "default value"
Beispiel #7
0
def test_re_field_with_many_groups():
    """A single match with two unnamed groups unpacks into (href, text)."""
    pattern = '<h1><a href="(.*?)">(.*?)</a></h1>'
    link_field = RegexField(re_select=pattern)
    # The result must unpack into exactly two captured groups.
    captured_href, captured_text = link_field.extract(html=HTML)
    assert captured_href == "https://github.com"
    assert captured_text == "Github"
Beispiel #8
0
def test_re_field_with_no_group():
    """A pattern without capture groups yields the whole matched text."""
    title_field = RegexField(re_select="<title>.*?</title>")
    whole_match = title_field.extract(html=HTML)
    assert whole_match == "<title>ruia</title>"
Beispiel #9
0
def test_re_field_with_one_group():
    """A pattern with one capture group yields just that group's text."""
    title_field = RegexField(re_select="<title>(.*?)</title>")
    title_text = title_field.extract(html=HTML)
    assert title_text == "ruia"
Beispiel #10
0
def test_re_field_with_html_element():
    """RegexField.extract accepts the `html_etree` object and exposes named groups."""
    pattern = '<h1><a href="(?P<href>.*?)">(?P<text>.*?)</a></h1>'
    heading_link = RegexField(re_select=pattern).extract(html=html_etree)
    assert heading_link['href'] == 'https://github.com'
    assert heading_link['text'] == 'Github'
Beispiel #11
0
class WechatItem(Item):
    """
    Ruia-based Item that extracts fields from a WeChat article page.
    Example: https://mp.weixin.qq.com/s/NKnTiLixjB9h8fSd7Gq8lw
    """

    # Article title
    # doc_name = AttrField(css_select='meta[property="og:title"]', attr="content")
    doc_name = AttrField(css_select='meta[property="og:title"]',
                         attr="content",
                         default="")
    # Description
    doc_des = AttrField(css_select='meta[property="og:description"]',
                        attr="content",
                        default="")
    # Article author
    doc_author = AttrField(css_select='meta[property="og:article:author"]',
                           attr="content",
                           default="")
    # Article link. The link carries an expiry time, but it does not expire
    # when opened inside WeChat, so it is still usable.
    doc_link = AttrField(css_select='meta[property="og:url"]',
                         attr="content",
                         default="")
    # Article type
    doc_type = AttrField(css_select='meta[property="og:type"]',
                         attr="content",
                         default="")
    # Article publish timestamp
    # NOTE(review): `time.time()` is evaluated once, when this class body is
    # executed, not per extraction. Since `clean_doc_ts` only falls back to
    # the current time when `int()` fails, a page without a match gets the
    # class-creation timestamp. Confirm whether that is intended.
    doc_ts = RegexField(
        re_select=r"var ct = \"(\d{1,10})\"\;",
        default=time.time(),
    )
    # Article publish date
    # NOTE(review): this default is likewise computed once at class creation.
    doc_date = RegexField(
        re_select=r"var ct = \"(\d{1,10})\"\;",
        default=ts_to_str_date(time.time()),
    )
    # doc_date_f1 = TextField(css_select="em#publish_time", default="")
    # doc_date_f2 = RegexField(
    #     re_select=r"o=\"(20\d.*)\"\;",
    #     default=ts_to_str_date(time.time(), "%Y-%m-%d %H:%M"),
    # )
    # doc_ts_f1 = TextField(css_select="em#publish_time", default="")
    # doc_ts_f2 = RegexField(
    #     re_select=r"o=\"(20\d.*)\"\;",
    #     default=ts_to_str_date(time.time(), "%Y-%m-%d %H:%M"),
    # )
    # Article image
    doc_image = AttrField(css_select='meta[property="og:image"]',
                          attr="content",
                          default="")
    # Official account name
    doc_source_name = TextField(
        css_select="div.profile_inner>strong.profile_nickname", default="")
    # Official account metadata
    doc_source_meta_list = TextField(
        css_select="p.profile_meta>span.profile_meta_value",
        many=True,
        default=["", ""])
    # Core HTML of the article
    doc_core_html = HtmlField(css_select="div#js_content", default="")
    # Official account nickname (filled in by clean_doc_source_meta_list)
    doc_source_account_nick = ""
    # Official account introduction (filled in by clean_doc_source_meta_list)
    doc_source_account_intro = ""
    # Text content, kept for compatibility
    doc_content = ""
    # Constants
    # Information source
    doc_source = "liuli_wechat"

    async def clean_doc_source_meta_list(self, value: list):
        """Extract the account nickname and introduction from doc_source_meta_list.

        The first entry is stored as the nickname and the second as the
        introduction. Missing entries are treated as empty strings, so a
        list with fewer than two items no longer raises ``IndexError``.

        :param value: extracted metadata values
        :return: ``value`` unchanged
        """
        # Pad on a copy so short lists are safe and `value` is not mutated.
        padded = list(value) + ["", ""]
        self.doc_source_account_nick = padded[0]
        self.doc_source_account_intro = padded[1]
        return value

    async def clean_doc_core_html(self, value: str):
        """Clean the core HTML.

        Strips surrounding whitespace, removes ``visibility: visible;`` and
        ``<br>``, renames ``data-src`` to ``src``, then passes the result
        through ``text_compress``.

        :param value: extracted core HTML (coerced with ``str``)
        :return: the result of ``text_compress`` on the cleaned markup
        """
        cleaned = (
            str(value)
            .strip()
            .replace("visibility: visible;", "")
            .replace("<br>", "")
            .replace("data-src", "src")
        )
        return text_compress(cleaned)

    async def clean_doc_date(self, value):
        """
        Clean the publish date; data format 2021-12-17 08:48.

        Passes ``value`` through ``ts_to_str_date``; if that raises, the
        current time is formatted instead.
        """
        try:
            value = ts_to_str_date(value)
        except Exception:
            # ts_to_str_date is defined elsewhere, so its failure modes are
            # not known here; keep the broad best-effort fallback.
            value = ts_to_str_date(time.time())
        return value

    async def clean_doc_ts(self, value):
        """
        Clean the publish timestamp; data format 1620567960.

        Converts ``value`` to ``int``; if the conversion fails, the current
        time (as an int) is used instead.
        """
        try:
            value = int(value)
        except (TypeError, ValueError, OverflowError):
            # Non-numeric text, None, or a non-finite float.
            value = int(time.time())
        return value