def test_attr_field_with_default_and_many():
    """Extracting ``href`` from ``div.brand b`` with a default and ``many=True``
    must yield ``["nothing"]``."""
    attr_field = AttrField(
        attr="href",
        css_select="div.brand b",
        many=True,
        default="nothing",
    )
    extracted = attr_field.extract(html_etree=html_etree)
    assert extracted == ["nothing"]
class UKSearchItem(Item):
    """Selectors for a search-result entry.

    ``title``, ``url`` and ``img`` are all read from attributes of the same
    ``a.pack-top_2nSnm`` anchor (``data-trackinfo``, ``href`` and ``style``).
    """

    target_item = TextField(
        css_select="div[data-spm=PhoneSokuThreeProgram_4] > div.pack-cover_1K0Xq"
    )
    title = AttrField(attr="data-trackinfo", css_select="a.pack-top_2nSnm")
    url = AttrField(attr="href", css_select="a.pack-top_2nSnm")
    img = AttrField(attr="style", css_select="a.pack-top_2nSnm")
class QidianNovelsItem(Item):
    """Selectors and cleaners for novel entries under ``ul.all-img-list>li``."""

    target_item = TextField(css_select="ul.all-img-list>li")
    novel_url = AttrField(attr="href", css_select="div.book-img-box>a")
    novel_name = TextField(css_select="div.book-mid-info>h4")
    novel_author = TextField(css_select="div.book-mid-info>p.author>a.name")
    novel_author_home_url = AttrField(
        attr="href", css_select="div.book-mid-info>p.author>a.name"
    )
    novel_type = TextField(
        css_select="div.book-mid-info > p.author > a:nth-child(4)"
    )
    novel_cover = AttrField(attr="src", css_select="div.book-img-box img")
    novel_abstract = TextField(css_select="div.book-mid-info p.intro")
    # novel_latest_chapter = TextField(css_select='div.bookupdate a')

    async def clean_novel_url(self, novel_url):
        """Prepend ``https:`` to the extracted novel link."""
        scheme = "https:"
        return scheme + novel_url

    async def clean_novel_author(self, novel_author):
        """Return the author; for a list, the ``.text`` of its first element."""
        if not isinstance(novel_author, list):
            return novel_author
        return novel_author[0].text

    async def clean_novel_author_home_url(self, novel_author_home_url):
        """Prepend ``https:`` to the author's home link.

        A list input is reduced to the stripped ``href`` of its first element.
        """
        link = novel_author_home_url
        if isinstance(link, list):
            first_node = link[0]
            link = first_node.get("href").strip()
        return "https:" + link

    async def clean_novel_cover(self, novel_cover):
        """Prepend ``https:`` to the extracted cover image source."""
        scheme = "https:"
        return scheme + novel_cover
class RankingItem(Item):
    """Selectors and cleaners for a ``.rank-list`` ranking block."""

    target_item = TextField(css_select='.rank-list')
    ranking_title = AttrField(css_select='h3.wrap-title', attr='html')
    more = AttrField(css_select='h3>a.more', attr='href')
    book_list = TextField(css_select='div.book-list>ul>li')

    async def clean_ranking_title(self, ranking_title):
        """Return the ranking title.

        :param ranking_title: extracted value; either a list of elements or an
            already-extracted value.
        :return: ``.text`` of the first element for a list input, otherwise the
            value unchanged.
        """
        if isinstance(ranking_title, list):
            return ranking_title[0].text
        # Previously this path fell through and returned None, silently
        # discarding a value that had already been extracted.
        return ranking_title

    async def clean_more(self, more):
        """Prepend ``http:`` to the extracted "more" link."""
        return "http:" + more
class HYNovelInfoItem(Item):
    """Novel detail item; most fields are read from ``og:*`` ``<meta>`` tags."""

    novel_name = AttrField(css_select="meta[property='og:title']",
                           attr='content')
    author = AttrField(css_select="meta[property='og:novel:author']",
                       attr='content')
    cover = AttrField(css_select="meta[property='og:image']", attr='content')
    abstract = AttrField(css_select="meta[property='og:description']",
                         attr='content')
    status = AttrField(css_select="meta[property='og:novel:status']",
                       attr='content')
    novels_type = AttrField(css_select="meta[property='og:novel:category']",
                            attr='content')
    novel_chapter_url = AttrField(css_select='div#voteList a.index',
                                  attr='href')
    latest_chapter = AttrField(
        css_select="meta[property='og:novel:latest_chapter_name']",
        attr='content')
    latest_chapter_url = AttrField(
        css_select="meta[property='og:novel:latest_chapter_url']",
        attr='content')
    latest_chapter_time = AttrField(
        css_select="meta[property='og:novel:update_time']", attr='content')

    # Earlier DOM-based selectors, kept for reference:
    # novel_name = TextField(css_select='div.c-left>div.mod>div.hd>h2')
    # author = TextField(css_select='div.author-zone div.right a.name strong')
    # cover = AttrField(css_select='img.book-cover', attr='src')
    # abstract = TextField(css_select='pre.note')
    # status = ''
    # novels_type = TextField(css_select='div.c-left>div.mod>div.hd>p.infos>span.cate>a')
    # latest_chapter = ''
    # novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')

    async def clean_cover(self, cover):
        """Upgrade an ``http://`` cover URL to ``https://``.

        Only the leading scheme is rewritten. The previous
        ``replace('http', 'https')`` also altered any later "http" substring
        in the host or path.

        :param cover: extracted cover URL string.
        :return: the URL with an ``https://`` scheme when it started with
            ``http://``; otherwise the URL unchanged.
        """
        insecure_prefix = 'http://'
        if cover.startswith(insecure_prefix):
            return 'https://' + cover[len(insecure_prefix):]
        return cover

    async def clean_novels_type(self, novels_type):
        """Map a category name through a small alias table.

        :param novels_type: extracted category.
        :return: the mapped category when the stripped string form is a key of
            the alias table, otherwise the input unchanged.
        """
        types_dict = {'社会': '都市'}
        # Single lookup; the leftover debug print of the same value is gone.
        return types_dict.get(str(novels_type).strip(), novels_type)

    async def clean_latest_chapter_time(self, latest_chapter_time):
        """Replace the relative-day words '今天' / '昨日' with concrete dates.

        :param latest_chapter_time: extracted update-time string.
        :return: the string with '今天' replaced by the current local date and
            '昨日' by the date 24 hours earlier, each formatted ``%Y-%m-%d ``
            (with a trailing space).
        """
        date_format = "%Y-%m-%d "
        # Read the clock once so both dates derive from the same instant.
        now = time.time()
        today = time.strftime(date_format, time.localtime(now))
        yesterday = time.strftime(date_format,
                                  time.localtime(now - 24 * 60 * 60))
        return latest_chapter_time.replace(u'今天', today).replace(
            u'昨日', yesterday)
class ArticleListItem(Item):
    """
    Selectors for the entries of an article list page.

    eg: http://www.ruanyifeng.com/blog/essays/
    """

    target_item = TextField(css_select="div#alpha-inner li.module-list-item")
    title = TextField(css_select="li.module-list-item>a")
    href = AttrField(attr="href", css_select="li.module-list-item>a")
class FictionItem(Item):
    """Selectors for ``dd`` entries: anchor text as title, ``href`` as url."""

    target_item = TextField(css_select="dd")
    title = TextField(css_select="a")
    url = AttrField(attr="href", css_select="a")

    async def clean_title(self, value):
        """Return the extracted title unchanged."""
        title = value
        return title
class QidianNovelInfoItem(Item):
    """Selectors and cleaners for a novel detail page."""

    novel_name = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    # Use AttrField when the value to extract is an attribute.
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    status = TextField(css_select='p.tag>span.blue')
    novels_type = TextField(css_select='p.tag>a.red')
    latest_chapter = TextField(css_select='li.update>div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    async def clean_cover(self, cover):
        """Prepend ``http:`` to the extracted cover image source."""
        return 'http:' + cover

    async def clean_status(self, status):
        """Join multiple status tags into one ``#``-separated string.

        When only one target matches, the value arrives already extracted;
        otherwise a list is passed in and is joined here.

        :param status: a list of elements, or an already-extracted value.
        :return: ``#``-joined ``.text`` of the elements for a list input,
            otherwise the value unchanged.
        """
        # A plain string used to be iterated character by character, which
        # raised AttributeError on ``.text``.
        if isinstance(status, list):
            return '#'.join([i.text for i in status])
        return status

    async def clean_novels_type(self, novels_type):
        """Join multiple type tags into one ``#``-separated string.

        :param novels_type: a list of elements, or an already-extracted value.
        :return: ``#``-joined ``.text`` of the elements for a list input,
            otherwise the value unchanged.
        """
        if isinstance(novels_type, list):
            return '#'.join([i.text for i in novels_type])
        return novels_type

    async def clean_latest_chapter_time(self, latest_chapter_time):
        """Replace the relative-day words '今天' / '昨日' with concrete dates.

        :param latest_chapter_time: extracted update-time string.
        :return: the string with '今天' replaced by the current local date and
            '昨日' by the date 24 hours earlier, each formatted ``%Y-%m-%d ``
            (with a trailing space).
        """
        date_format = "%Y-%m-%d "
        # Read the clock once so both dates derive from the same instant.
        now = time.time()
        today = time.strftime(date_format, time.localtime(now))
        yesterday = time.strftime(date_format,
                                  time.localtime(now - 24 * 60 * 60))
        return latest_chapter_time.replace(u'今天', today).replace(
            u'昨日', yesterday)
class JianshuItem(Item):
    """Selectors for ``ul.list>li`` entries: author name and profile link."""

    target_item = TextField(css_select="ul.list>li")
    author_name = TextField(css_select="a.name")
    author_url = AttrField(css_select="a.name", attr="href")

    async def clean_author_url(self, author_url):
        """Prefix the extracted link with the ``https://www.jianshu.com`` origin."""
        absolute_url = f"https://www.jianshu.com{author_url}"
        return absolute_url
class HackerNewsItem(Item):
    """Selectors for ``tr.athing`` rows: story title and link."""

    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(attr="href", css_select="a.storylink")

    async def clean_title(self, value):
        """Return the extracted title unchanged."""
        title = value
        return title
class HackerNewsItem(Item):
    """Selectors for ``tr.athing`` rows: story title and link."""

    target_item = TextField(css_select='tr.athing')
    title = TextField(css_select='a.storylink')
    url = AttrField(attr='href', css_select='a.storylink')

    async def clean_title(self, value):
        """Return the title with surrounding whitespace removed."""
        trimmed = value.strip()
        return trimmed
class ArchivesItem(Item):
    """
    Selectors for the links listed on an archives page.

    eg: http://www.ruanyifeng.com/blog/archives.html
    """

    target_item = TextField(css_select='div#beta-inner li.module-list-item')
    href = AttrField(attr='href', css_select='li.module-list-item>a')
class FishItem(Item):
    """Selectors for ``div.m_search_list dl`` entries: title, date and link."""

    target_item = TextField(css_select="div.m_search_list dl")
    title = TextField(css_select="h2 a")
    date = TextField(css_select="dd.search_laiyuan")
    url = AttrField(attr="href", css_select="h2 a")

    async def clean_date(self, value):
        """Remove every occurrence of the '发布时间:' label from the date text."""
        return value.replace('发布时间:', '')
class Data258WechatItem(Item):
    """
    Extracts info from the first-level official-account search page of
    mp.data258.com.

    Example: https://mp.data258.com/mp/search?type=category&key=老胡的储物柜&sort=
    """

    target_item = TextField(css_select="div.layui-panel")
    wechat_name = TextField(default="", css_select="h2>a")
    # NOTE(review): "wehcat" looks like a misspelling of "wechat"; the name is
    # kept because it is the public field name callers read.
    wehcat_href = AttrField(default="", attr="href", css_select="h2>a")
class DiseaseItem(Item):
    """Selectors for every ``div.keshi_list>a`` link: names and URLs."""

    disease_name = TextField(many=True, css_select="div.keshi_list>a")
    disease_url = AttrField(
        many=True, attr="href", css_select="div.keshi_list>a"
    )

    async def clean_disease_url(self, disease_url):
        """Resolve each extracted link against ``https://m.120ask.com``."""
        base = 'https://m.120ask.com'
        absolute_urls = []
        for link in disease_url:
            absolute_urls.append(urljoin(base, link))
        return absolute_urls
class FishItem(Item):
    """Selectors for ``td [width="530"]`` entries: title, date and link."""

    target_item = TextField(css_select='td [width="530"]')
    title = TextField(css_select="a")
    date = TextField(css_select="td")
    url = AttrField(attr="href", css_select="a")

    async def clean_date(self, value):
        """Take the last 10 characters of the text and turn '.' into '-'."""
        tail = value[-10:]
        return tail.replace('.', '-')
class FishItem(Item):
    """Selectors for ``#info`` entries: title, date and link."""

    target_item = TextField(css_select='#info ')
    title = TextField(css_select='h2 a')
    date = TextField(css_select='dd.search_laiyuan')
    url = AttrField(css_select='h2 a', attr='href')

    async def clean_date(self, value):
        """Extract the parenthesised date from the raw text.

        :param value: raw extracted text, expected to contain ``(...)``.
        :return: the segment after the first ``(`` (up to the next ``(``, if
            any) with trailing ``)`` characters removed; the input unchanged
            when it contains no ``(``.
        """
        parts = value.split('(')
        if len(parts) < 2:
            # No opening parenthesis: this used to raise IndexError.
            return value
        # The leftover debug print of the date is gone.
        return parts[1].rstrip(')')
class DoubanItems(Item):
    """Selectors for ``div.item`` entries: title, cover and abstract."""

    target_item = TextField(css_select="div.item")
    title = TextField(css_select="span.title")
    cover = AttrField(attr="src", css_select="div.pic>a>img")
    abstract = TextField(css_select="span.inq")

    async def clean_title(self, title):
        """Return a string title as-is; otherwise concatenate the elements' text.

        Each element's ``.text`` is stripped and has ``\\xa0`` removed before
        joining.
        """
        if isinstance(title, str):
            return title
        pieces = []
        for node in title:
            pieces.append(node.text.strip().replace('\xa0', ''))
        return ''.join(pieces)
class DoubanItem(Item):
    """Selectors for ``div.item`` entries: title, cover and abstract."""

    target_item = TextField(css_select='div.item')
    title = TextField(css_select='span.title')
    cover = AttrField(attr='src', css_select='div.pic>a>img')
    abstract = TextField(default='', css_select='span.inq')

    async def clean_title(self, title):
        """Return a string title as-is; otherwise concatenate the elements' text.

        Each element's ``.text`` is stripped and has ``\\xa0`` removed before
        joining.
        """
        if isinstance(title, str):
            return title
        pieces = []
        for node in title:
            pieces.append(node.text.strip().replace("\xa0", ""))
        return "".join(pieces)
class DiseaseHomeItem(Item):
    """Selectors for a disease page: name, subject and the list of questions."""

    disease_name = TextField(css_select="h1.ti")
    disease_subject = TextField(css_select="div.table>div:nth-child(1)>span")
    disease_ask_lists = TextField(
        many=True, css_select="div.ask_lists div.lists a"
    )
    disease_ask_link_lists = AttrField(
        many=True, attr="href", css_select="div.ask_lists div.lists a"
    )

    async def clean_disease_ask_link_lists(self, disease_ask_link_lists):
        """Prepend ``http:`` to every extracted question link."""
        prefixed = []
        for link in disease_ask_link_lists:
            prefixed.append('http:' + link)
        return prefixed
class ZHNovelsItem(Item):
    """Selectors and cleaners for ``div.store_collist div.bookbox`` entries."""

    target_item = TextField(css_select='div.store_collist div.bookbox')
    novel_url = AttrField(css_select='div.bookinfo div.bookname a',
                          attr='href')
    novel_title = TextField(css_select='div.bookinfo div.bookname a')
    novel_author = TextField(css_select='div.bookilnk a:nth-child(1)')
    novel_type = TextField(css_select='div.bookilnk a:nth-child(2)')
    novel_cover = AttrField(css_select='div.bookimg img', attr='src')
    novel_abstract = TextField(css_select='div.bookintro')
    novel_lastest_update = TextField(css_select='div.bookupdate a')

    async def clean_novel_url(self, novel_url):
        """Rewrite the ``/book/`` path segment of the link to ``/showchapter/``."""
        return novel_url.replace('/book/', '/showchapter/')

    async def clean_novel_author(self, novel_author):
        """Return the author name.

        :return: ``''`` for a falsy value, ``.text`` of the first element for a
            list, otherwise the value unchanged.
        """
        if not novel_author:
            return ''
        if isinstance(novel_author, list):
            novel_author = novel_author[0].text
        return novel_author

    async def clean_novel_abstract(self, novel_abstract):
        """Strip escaped line breaks and full-width spaces from the abstract.

        This method was previously also named ``clean_novel_url``. It
        overrode the real URL cleaner above, so the URL rewrite never ran and
        the abstract was never cleaned.
        """
        # NOTE(review): these patterns match the literal two-character
        # sequences backslash-r / backslash-n and the literal text
        # backslash-u3000, not the control characters themselves. Confirm
        # that this is the intended input form.
        return novel_abstract.replace('\\r', '').replace('\\n', '').replace(
            r'\u3000', '')

    # def tal_novel_author_home_url(self, novel_author_home_url):
    #     if isinstance(novel_author_home_url, list):
    #         novel_author_home_url = novel_author_home_url[0].get('href').strip()
    #     return 'http:' + novel_author_home_url

    async def save(self, res_dic):
        """Store ``res_dic`` via ``self.es.Index_Data``.

        ``self.es`` and ``self.logger`` are not defined in this class;
        presumably they are attached elsewhere. Verify against the caller.

        :param res_dic: payload passed to ``Index_Data``.
        :return: ``True`` on success; ``False`` after logging any exception.
        """
        # Store into ES.
        try:
            await self.es.Index_Data(res_dic)
            # self.logger.info("inserted successfully")
            return True
        except Exception as e:
            self.logger.exception(e)
            return False
class ZHNovelInfoItem(Item):
    """Selectors and cleaners for a novel detail page."""

    novel_name = TextField(css_select='div.main div.status h1 a')
    author = TextField(css_select='div.main div.status div.booksub a')
    # Use AttrField when the value to extract is an attribute.
    cover = AttrField(css_select='div.main div.book_cover img', attr='src')
    abstract = TextField(css_select='div.main div.status div.info_con p')
    status = AttrField(css_select='div.main div.status h1 em', attr='title')
    novels_type = TextField(css_select='div.main div.status div.booksub a')
    novel_chapter_url = AttrField(
        css_select='div.main div.status div.book_btn span.list a',
        attr='href')

    async def clean_author(self, author):
        """Return ``.text`` of the first element for a list, else the value."""
        if isinstance(author, list):
            return author[0].text
        return author

    async def clean_status(self, status):
        """Join multiple status tags into one ``#``-separated string.

        When only one target matches, the value arrives already extracted;
        otherwise a list is passed in and is joined here.

        :param status: a list of elements, or an already-extracted value.
        :return: for a list, the ``title`` attributes (stripped, with '作品'
            removed) joined by ``#``; otherwise the value unchanged.
        """
        if isinstance(status, list):
            return '#'.join(
                [i.get('title').strip().replace('作品', '') for i in status])
        return status

    async def clean_novels_type(self, novels_type):
        """Return the text of the second matched element.

        :param novels_type: a list of elements, or any other value.
        :return: ``.text`` of element 1 for a list; ``''`` when the input is
            not a list, the list is too short, or the element has no ``.text``.
        """
        if not isinstance(novels_type, list):
            return ''
        try:
            return novels_type[1].text
        except (IndexError, AttributeError):
            # Narrowed from a bare ``except:``, which in a coroutine also
            # swallowed task cancellation and KeyboardInterrupt.
            return ''
class ZHNovelsItem(Item):
    """Selectors and cleaners for ``div.store_collist div.bookbox`` entries."""

    target_item = TextField(css_select="div.store_collist div.bookbox")
    novel_url = AttrField(attr="href", css_select="div.bookinfo div.bookname a")
    novel_name = TextField(css_select="div.bookinfo div.bookname a")
    novel_author = TextField(css_select="div.bookilnk a:nth-child(1)")
    novel_author_home_url = AttrField(
        attr="href", css_select="div.bookilnk a:nth-child(1)"
    )
    novel_type = TextField(css_select="div.bookilnk a:nth-child(2)")
    novel_cover = AttrField(attr="src", css_select="div.bookimg img")
    novel_abstract = TextField(css_select="div.bookintro")
    novel_latest_chapter = TextField(css_select="div.bookupdate a")

    # def tal_novel_url(self, novel_url):
    #     return 'http:' + novel_url

    async def clean_novel_author(self, novel_author):
        """Return the author name.

        Falsy input gives ``''``; a list gives ``.text`` of its first element;
        anything else is returned unchanged.
        """
        if not novel_author:
            return ''
        if not isinstance(novel_author, list):
            return novel_author
        return novel_author[0].text
class QidianNovelsItem(Item):
    """Selectors and cleaners for ``div.book-img-text>ul>li`` entries."""

    target_item = TextField(css_select="div.book-img-text>ul>li")
    novel_url = AttrField(attr="href", css_select="div.book-img-box>a")
    novel_title = TextField(css_select="div.book-mid-info>h4")
    novel_author = TextField(css_select="div.book-mid-info>p.author>a.name")
    novel_type = TextField(
        css_select="div.book-mid-info > p.author > a:nth-child(4)"
    )
    novel_status = TextField(css_select="div.book-mid-info>p.author>span")
    novel_cover = AttrField(attr="src", css_select="div.book-img-box img")
    novel_abstract = TextField(css_select="div.book-mid-info p.intro")
    novel_lastest_update = TextField(css_select="div.book-mid-info > p.update")

    async def clean_novel_url(self, novel_url):
        """Prepend ``https:`` to the extracted novel link."""
        scheme = "https:"
        return scheme + novel_url

    async def clean_novel_author(self, novel_author):
        """Return the author; for a list, the ``.text`` of its first element."""
        if not isinstance(novel_author, list):
            return novel_author
        return novel_author[0].text

    async def clean_novel_cover(self, novel_cover):
        """Prepend ``https:`` to the extracted cover image source."""
        scheme = "https:"
        return scheme + novel_cover
class RankingItem(Item):
    """Selectors and cleaners for a ``.rank-list`` ranking block."""

    target_item = TextField(css_select=".rank-list")
    ranking_title = TextField(css_select="h3.wrap-title")
    more = AttrField(attr="href", css_select="h3>a.more")
    book_list = HtmlField(many=True, css_select="div.book-list>ul>li")

    async def clean_ranking_title(self, ranking_title):
        """Normalise the ranking title.

        A list yields ``.text`` of its first element. Any other value is
        stringified, cut at the first '榜', and has '榜' appended again.
        """
        if not isinstance(ranking_title, list):
            head = str(ranking_title).split('榜')[0]
            return head + '榜'
        return ranking_title[0].text

    async def clean_more(self, more):
        """Prepend ``https:`` to the extracted "more" link."""
        scheme = "https:"
        return scheme + more
class Data258WechatListItem(Item):
    """
    Extracts the historical article entries of an official account on
    mp.data258.com.

    Example: https://mp.data258.com/article/category/howie_locker
    """

    target_item = TextField(css_select="ul.jie-row>li")
    w_article_title = TextField(default="", css_select="a.jie-title")
    w_article_href = AttrField(
        default="", attr="href", css_select="a.jie-title"
    )

    async def clean_w_article_title(self, value: list):
        """Return the article title as a stripped string, or ``""`` when falsy."""
        if not value:
            return ""
        return str(value).strip()
class HackerNewsItem(Item):
    """
    Extraction rules for the target fields of ``tr.athing`` rows.
    """

    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(attr="href", css_select="a.storylink")

    async def clean_title(self, value):
        """
        Clean the extracted title.

        :param value: raw extracted data
        :return: the string form of ``value`` with surrounding whitespace removed
        """
        as_text = str(value)
        return as_text.strip()
class FishItem(Item):
    """Selectors and cleaners for ``div.jsearch-result-box`` entries."""

    target_item = TextField(css_select="div.jsearch-result-box")
    title = TextField(css_select="div.jsearch-result-title")
    date = TextField(css_select="span.jsearch-result-date")
    url = AttrField(attr="href", css_select="div.jsearch-result-title a")

    async def clean_date(self, value):
        """Convert a ``…年…月…日`` date into a dash-separated form.

        Trailing '-', then trailing spaces, then trailing '日' are removed (in
        that order) before '年' and '月' are each replaced with '-'.
        """
        trimmed = value.rstrip('-').rstrip(' ').rstrip('日')
        return trimmed.replace('年', '-').replace('月', '-')

    async def clean_url(self, value):
        """Prefix the extracted link with ``http://www.cast.org.cn``."""
        return 'http://www.cast.org.cn' + value
class SGWechatItem(Item):
    """
    Extractor for the Sogou WeChat official-account search page; usually there
    is only a single result.

    Example: https://weixin.sogou.com/weixin?query=老胡的储物柜
    """

    # By default this page is treated as a multi-row content list.
    target_item = TextField(css_select="div.news-box>ul>li")
    wechat_name = TextField(default="", css_select="p.tit>a")
    wechat_id = TextField(default="", css_select='label[name="em_weixinhao"]')
    latest_title = TextField(default="暂无更新", css_select='dd>a[target="_blank"]')
    latest_href = AttrField(
        default="", attr="href", css_select='dd>a[target="_blank"]'
    )

    async def clean_wechat_name(self, wechat_name: str) -> str:
        """
        Clean wechat_name: drop newlines and spaces, then strip.
        """
        text = str(wechat_name)
        for unwanted in ("\n", " "):
            text = text.replace(unwanted, "")
        return text.strip()

    async def clean_wechat_id(self, wechat_id: str) -> str:
        """
        Clean wechat_id: stringify and strip surrounding whitespace.
        """
        text = str(wechat_id)
        return text.strip()

    async def clean_latest_title(self, latest_title: str) -> str:
        """
        Clean latest_title: drop newlines and spaces, then strip.
        """
        text = str(latest_title)
        for unwanted in ("\n", " "):
            text = text.replace(unwanted, "")
        return text.strip()

    async def clean_latest_href(self, latest_href: str) -> str:
        """
        Clean latest_href: prefix a non-empty link with the Sogou origin;
        a falsy link yields ``""``.
        """
        if not latest_href:
            return ""
        return f"https://weixin.sogou.com/{latest_href}"
def refresh_user_ns(self, request: Request, response: Response):
    """
    Rebind the entries of ``self.user_ns`` to the given request/response.

    Publishes ``asyncio``, ``ruia``, ``request``, ``response`` and ``fetch``,
    plus two extraction helpers, ``attr_field`` and ``text_field``.

    :param request: ruia.Request
    :param response: ruia.Response
    :return: None
    """
    import ruia

    namespace = self.user_ns
    namespace["asyncio"] = asyncio
    namespace["ruia"] = ruia
    namespace["request"] = request
    namespace["response"] = response
    namespace["fetch"] = self.fetch
    # ``response.etree`` is bound as a default argument, so each helper keeps
    # using this response's tree unless an ``etree`` is passed explicitly.
    namespace["attr_field"] = (
        lambda etree=response.etree, **kwargs: AttrField(**kwargs).extract(etree)
    )
    namespace["text_field"] = (
        lambda etree=response.etree, **kwargs: TextField(**kwargs).extract(etree)
    )