class MyItem(Item):
    """Define the target fields extracted by the spider."""

    # Selected with the root ``html`` selector.
    target_item = HtmlField(css_select="html")
    # Text of the ``title`` element found under ``head``.
    title = TextField(css_select="head title")
    # HTML of the element matched by the ``article`` selector.
    article = HtmlField(css_select="article")
def test_html_field_with_many():
    """Check that ``many=True`` returns one HTML string per matched link."""
    link_field = HtmlField(css_select="a.test_link", many=True)
    extracted = link_field.extract(html_etree=html_etree)

    expected_first = (
        '<a class="test_link" href="https://github.com/howie6879/">hello1 github.</a>\n'
    )
    # The last match also carries the text that trails the anchor element.
    # NOTE(review): the leading whitespace of the trailing text must match the
    # fixture exactly -- confirm against the HTML used to build ``html_etree``.
    expected_last = (
        '<a class="test_link" href="https://github.com/howie6879/">hello5 github.</a>\n'
        ' Some text outside.\n'
    )

    assert len(extracted) == 5
    assert extracted[0] == expected_first
    assert extracted[4] == expected_last
def test_html_field():
    """Check single-match HTML extraction for an anchor and a paragraph."""
    anchor_field = HtmlField(css_select="div.brand a")
    paragraph_field = HtmlField(css_select="div.brand p")

    # English content: the anchor inside ``div.brand``.
    anchor_html = anchor_field.extract(html_etree=html_etree)
    assert anchor_html == '<a href="https://github.com">Github</a>'

    # Chinese content: the paragraph inside ``div.brand``.
    paragraph_html = paragraph_field.extract(html_etree=html_etree)
    assert paragraph_html == "<p>你好</p>\n"
class RankingItem(Item):
    """Field selectors and cleaners for one ``.rank-list`` ranking block."""

    target_item = TextField(css_select=".rank-list")
    ranking_title = TextField(css_select="h3.wrap-title")
    more = AttrField(css_select="h3>a.more", attr="href")
    book_list = HtmlField(css_select="div.book-list>ul>li", many=True)

    async def clean_ranking_title(self, ranking_title):
        """Normalise the ranking title.

        A list input yields the ``.text`` of its first element. Any other
        input is converted to ``str`` and truncated after the first "榜"
        (the suffix is appended even when it was not present).
        """
        if not isinstance(ranking_title, list):
            # Keep everything before the first "榜", then re-append it.
            prefix = str(ranking_title).partition("榜")[0]
            return prefix + "榜"
        return ranking_title[0].text

    async def clean_more(self, more):
        """Prefix the extracted ``href`` with the ``https:`` scheme."""
        scheme = "https:"
        return scheme + more
class ChinaNewsItem(Item):
    """Define the extraction rules for the target fields."""

    # target_item =   (not set)
    title = TextField(css_select="h1")
    content = HtmlField(css_select="div.left_zw")

    async def clean_title(self, value):
        """Clean the extracted title.

        :param value: the raw extracted value
        :return: the value, currently passed through unchanged
        """
        cleaned = value
        return cleaned
def test_html_field():
    """Check that a single-match ``HtmlField`` returns the anchor's HTML."""
    brand_link = HtmlField(css_select="div.brand a")
    rendered = brand_link.extract(html_etree=html_etree)
    expected = '<a href="https://github.com">Github</a>'
    assert rendered == expected
class RankingItem(Item):
    """Field selectors for one ``div.rank_i_p_list`` ranking block."""

    target_item = TextField(css_select="div.rank_i_p_list")
    # Title text of the ranking.
    ranking_title = TextField(css_select="div.rank_i_p_tit")
    # ``href`` of the "more" link.
    more = AttrField(css_select="div.rank_i_more a", attr="href")
    # One HTML fragment per ``div.rank_i_li`` entry.
    book_list = HtmlField(
        css_select="div.rank_i_p_list>div.rank_i_li",
        many=True,
    )
class WechatItem(Item):
    """
    Ruia-based Item that extracts fields from a WeChat article page.

    Example page: https://mp.weixin.qq.com/s/NKnTiLixjB9h8fSd7Gq8lw
    """

    # Article title
    # doc_name = AttrField(css_select='meta[property="og:title"]', attr="content")
    doc_name = AttrField(
        css_select='meta[property="og:title"]', attr="content", default=""
    )
    # Description
    doc_des = AttrField(
        css_select='meta[property="og:description"]', attr="content", default=""
    )
    # Article author
    doc_author = AttrField(
        css_select='meta[property="og:article:author"]', attr="content", default=""
    )
    # Article link. The link has an expiry time, but it does not expire when
    # opened inside the WeChat ecosystem, so it is still usable.
    doc_link = AttrField(
        css_select='meta[property="og:url"]', attr="content", default=""
    )
    # Article type
    doc_type = AttrField(
        css_select='meta[property="og:type"]', attr="content", default=""
    )
    # Article publish timestamp
    # NOTE(review): ``time.time()`` is evaluated once, when this class body is
    # executed, so the default is the class-creation timestamp, not "now".
    doc_ts = RegexField(
        re_select=r"var ct = \"(\d{1,10})\"\;",
        default=time.time(),
    )
    # Article publish date
    # NOTE(review): like ``doc_ts``, this default is computed once at class
    # creation time.
    doc_date = RegexField(
        re_select=r"var ct = \"(\d{1,10})\"\;",
        default=ts_to_str_date(time.time()),
    )
    # doc_date_f1 = TextField(css_select="em#publish_time", default="")
    # doc_date_f2 = RegexField(
    #     re_select=r"o=\"(20\d.*)\"\;",
    #     default=ts_to_str_date(time.time(), "%Y-%m-%d %H:%M"),
    # )
    # doc_ts_f1 = TextField(css_select="em#publish_time", default="")
    # doc_ts_f2 = RegexField(
    #     re_select=r"o=\"(20\d.*)\"\;",
    #     default=ts_to_str_date(time.time(), "%Y-%m-%d %H:%M"),
    # )
    # Article image
    doc_image = AttrField(
        css_select='meta[property="og:image"]', attr="content", default=""
    )
    # Official account name
    doc_source_name = TextField(
        css_select="div.profile_inner>strong.profile_nickname", default=""
    )
    # Official account metadata
    doc_source_meta_list = TextField(
        css_select="p.profile_meta>span.profile_meta_value",
        many=True,
        default=["", ""],
    )
    # Core HTML of the article
    doc_core_html = HtmlField(css_select="div#js_content", default="")
    # Official account nickname (filled in by clean_doc_source_meta_list)
    doc_source_account_nick = ""
    # Official account introduction (filled in by clean_doc_source_meta_list)
    doc_source_account_intro = ""
    # Text content, kept for compatibility
    doc_content = ""

    # Constants
    # Information source
    doc_source = "liuli_wechat"

    async def clean_doc_source_meta_list(self, value: list):
        """Extract the account nickname and introduction from the meta list.

        The first entry is stored in ``doc_source_account_nick`` and the
        second in ``doc_source_account_intro``. A missing entry falls back to
        ``""`` instead of raising ``IndexError``, so pages exposing fewer than
        two meta values can still be processed.

        :param value: extracted list of profile meta values
        :return: ``value`` unchanged
        """
        meta = value or ()
        self.doc_source_account_nick = meta[0] if len(meta) > 0 else ""
        self.doc_source_account_intro = meta[1] if len(meta) > 1 else ""
        return value

    async def clean_doc_core_html(self, value: str):
        """Clean the core HTML.

        Strips surrounding whitespace, removes ``visibility: visible;`` and
        ``<br>``, rewrites ``data-src`` to ``src``, then passes the result to
        ``text_compress``.
        """
        html = str(value).strip()
        html = html.replace("visibility: visible;", "")
        html = html.replace("<br>", "")
        html = html.replace("data-src", "src")
        return text_compress(html)

    async def clean_doc_date(self, value):
        """Clean the publish date (format such as ``2021-12-17 08:48``).

        Converts ``value`` with ``ts_to_str_date``; if that raises, the
        current time is formatted instead.
        """
        try:
            value = ts_to_str_date(value)
        except Exception:
            # Best-effort: the helper's failure modes are not visible here,
            # so any conversion error falls back to the current time.
            value = ts_to_str_date(time.time())
        return value

    async def clean_doc_ts(self, value):
        """Clean the publish timestamp (format such as ``1620567960``).

        Converts ``value`` to ``int``; if it cannot be converted, the current
        time is used instead.
        """
        try:
            value = int(value)
        except (TypeError, ValueError, OverflowError):
            value = int(time.time())
        return value
class HackerNewsItem(Item):
    """Field selectors for one ``tr.athing`` row."""

    target_item = TextField(css_select="tr.athing")
    # All three fields below read from the same ``a.storylink`` anchor.
    title = TextField(css_select="a.storylink")
    url = AttrField(css_select="a.storylink", attr="href")
    content = HtmlField(css_select="a.storylink")