def test_re_field_get_nothing_with_no_default():
    """Check that a non-matching pattern without a default raises NothingMatchedError."""
    unmatched_field = RegexField(re_select='nothing to match.')
    try:
        unmatched_field.extract(html=HTML)
    except NothingMatchedError:
        # Expected outcome: nothing matched and no default was configured.
        return
    # Reaching this point means extract() returned normally, which is a failure.
    raise AssertionError
def test_re_field_in_dict_format_with_many():
    """Check that named groups with many=True yield one mapping per matched link."""
    link_pattern = '<a class="test_link" href="(?P<href>.*?)">(?P<text>.*?)</a>'
    links = RegexField(re_select=link_pattern, many=True).extract(html=HTML)
    assert len(links) == 5
    # Only the first and the last link are inspected, in that order.
    for position, label in ((0, 'hello1 github.'), (4, 'hello5 github.')):
        assert links[position]['href'] == 'https://github.com/howie6879/'
        assert links[position]['text'] == label
def test_re_field_with_many():
    """Check that unnamed groups with many=True yield one (href, text) pair per link."""
    link_pattern = '<a class="test_link" href="(.*?)">(.*?)</a>'
    pairs = RegexField(re_select=link_pattern, many=True).extract(html=HTML)
    assert len(pairs) == 5
    # Each entry unpacks into exactly two captured groups.
    first_href, first_text = pairs[0]
    last_href, last_text = pairs[4]
    assert first_href == 'https://github.com/howie6879/'
    assert first_text == 'hello1 github.'
    assert last_href == 'https://github.com/howie6879/'
    assert last_text == 'hello5 github.'
def test_re_field_with_html_element():
    """Check that RegexField can extract named groups from the `html_etree` input."""
    heading_pattern = '<h1><a href="(?P<href>.*?)">(?P<text>.*?)</a></h1>'
    heading_link = RegexField(re_select=heading_pattern).extract(html=html_etree)
    assert heading_link["href"] == "https://github.com"
    assert heading_link["text"] == "Github"
def test_re_field_get_nothing_with_no_default():
    """Check that a non-matching pattern without a default raises NothingMatchedError.

    The test fails when ``extract`` returns normally; any exception other than
    NothingMatchedError propagates and fails the test as well.
    """
    field = RegexField(re_select="nothing to match.")
    try:
        field.extract(html=HTML)
    except NothingMatchedError:
        # Expected outcome: nothing matched and no default was configured.
        pass
    else:
        # Without this branch the test would pass silently when nothing is raised.
        raise AssertionError("extract() did not raise NothingMatchedError")
def test_re_field_with_default():
    """Check that the configured default is returned when the pattern matches nothing."""
    fallback = "default value"
    defaulted_field = RegexField(re_select="nothing to match.", default=fallback)
    extracted = defaulted_field.extract(html=HTML)
    assert extracted == fallback
def test_re_field_with_many_groups():
    """Check that two unnamed groups are returned as an unpackable pair."""
    heading_pattern = '<h1><a href="(.*?)">(.*?)</a></h1>'
    link_target, link_label = RegexField(re_select=heading_pattern).extract(html=HTML)
    assert link_target == "https://github.com"
    assert link_label == "Github"
def test_re_field_with_no_group():
    """Check that a pattern without groups returns the whole matched text."""
    title_field = RegexField(re_select="<title>.*?</title>")
    whole_match = title_field.extract(html=HTML)
    assert whole_match == "<title>ruia</title>"
def test_re_field_with_one_group():
    """Check that a pattern with a single group returns only the captured text."""
    title_field = RegexField(re_select="<title>(.*?)</title>")
    captured_title = title_field.extract(html=HTML)
    assert captured_title == "ruia"
def test_re_field_with_html_element():
    """Check named-group extraction when the input is `html_etree` rather than `HTML`."""
    link_field = RegexField(
        re_select='<h1><a href="(?P<href>.*?)">(?P<text>.*?)</a></h1>',
    )
    groups = link_field.extract(html=html_etree)
    # Verify both named groups of the heading link.
    expected = (('href', 'https://github.com'), ('text', 'Github'))
    for group_name, group_value in expected:
        assert groups[group_name] == group_value
class WechatItem(Item):
    """
    Ruia-based Item that extracts fields from a WeChat article page.

    Example page: https://mp.weixin.qq.com/s/NKnTiLixjB9h8fSd7Gq8lw
    """

    # Article title
    doc_name = AttrField(css_select='meta[property="og:title"]',
                         attr="content",
                         default="")
    # Description
    doc_des = AttrField(css_select='meta[property="og:description"]',
                        attr="content",
                        default="")
    # Article author
    doc_author = AttrField(css_select='meta[property="og:article:author"]',
                           attr="content",
                           default="")
    # Article link. The link has an expiry time, but it does not expire when
    # opened inside the WeChat ecosystem, so it is still usable.
    doc_link = AttrField(css_select='meta[property="og:url"]',
                         attr="content",
                         default="")
    # Article type
    doc_type = AttrField(css_select='meta[property="og:type"]',
                         attr="content",
                         default="")

    # Placeholder used as the default of ``doc_ts``. It is evaluated once, when
    # the class is created, so it must never be reported as a publish time;
    # ``clean_doc_ts`` recognises it and substitutes the current time instead.
    _IMPORT_TIME_TS = time.time()

    # Article publish timestamp
    doc_ts = RegexField(
        re_select=r"var ct = \"(\d{1,10})\"\;",
        default=_IMPORT_TIME_TS,
    )
    # Article publish date
    # NOTE(review): this default is also evaluated once at class creation;
    # clean_doc_date re-runs it through ts_to_str_date -- confirm that the
    # helper rejects an already formatted date so the fallback path is taken.
    doc_date = RegexField(
        re_select=r"var ct = \"(\d{1,10})\"\;",
        default=ts_to_str_date(time.time()),
    )
    # Disabled alternatives for the publish date/timestamp (kept for reference):
    #   TextField(css_select="em#publish_time", default="")
    #   RegexField(re_select=r"o=\"(20\d.*)\"\;",
    #              default=ts_to_str_date(time.time(), "%Y-%m-%d %H:%M"))

    # Article cover image
    doc_image = AttrField(css_select='meta[property="og:image"]',
                          attr="content",
                          default="")
    # Official account name
    doc_source_name = TextField(
        css_select="div.profile_inner>strong.profile_nickname", default="")
    # Official account metadata
    doc_source_meta_list = TextField(
        css_select="p.profile_meta>span.profile_meta_value",
        many=True,
        default=["", ""])
    # Core html
    doc_core_html = HtmlField(css_select="div#js_content", default="")
    # Official account nickname
    doc_source_account_nick = ""
    # Official account introduction
    doc_source_account_intro = ""
    # Text content, kept for compatibility
    doc_content = ""

    # Constants
    # Information source
    doc_source = "liuli_wechat"

    async def clean_doc_source_meta_list(self, value: list):
        """Derive the account nickname and introduction from doc_source_meta_list.

        The first entry is stored as the nickname and the second as the
        introduction. Missing entries are treated as empty strings, so a page
        exposing fewer than two meta values no longer raises IndexError.

        Returns:
            ``value`` unchanged.
        """
        padded = list(value or []) + ["", ""]
        self.doc_source_account_nick = padded[0]
        self.doc_source_account_intro = padded[1]
        return value

    async def clean_doc_core_html(self, value: str):
        """Clean the core html.

        Strips surrounding whitespace, removes ``visibility: visible;`` and
        ``<br>``, rewrites ``data-src`` to ``src`` and passes the result to
        ``text_compress``.
        """
        cleaned = str(value).strip()
        cleaned = cleaned.replace("visibility: visible;", "")
        cleaned = cleaned.replace("<br>", "")
        cleaned = cleaned.replace("data-src", "src")
        return text_compress(cleaned)

    async def clean_doc_date(self, value):
        """
        Clean the publish date, data format 2021-12-17 08:48
        """
        try:
            return ts_to_str_date(value)
        except Exception:
            # Deliberate best effort: any value the helper cannot convert is
            # replaced by the current time.
            return ts_to_str_date(time.time())

    async def clean_doc_ts(self, value):
        """
        Clean the publish timestamp, data format 1620567960

        Returns ``int(value)``; the current time is used when the value is the
        class-creation placeholder or cannot be converted to an integer.
        """
        if value is self._IMPORT_TIME_TS:
            # The field fell back to its default: report "now" instead of the
            # moment this class was created.
            return int(time.time())
        try:
            return int(value)
        except Exception:
            # Deliberate best effort for unparsable values.
            return int(time.time())