class QidianNovelInfoItem(Item):
    """Fields describing one novel on a Qidian book page.

    The ``tal_<field>`` methods clean up the value extracted for the field
    of the same name (the hook-up is presumably done by the ``Item`` base
    class, which is not visible in this file section).
    """

    novel_name = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    # A value that lives in an attribute has to be declared as an AttrField.
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    status = TextField(css_select='p.tag>span.blue')
    novels_type = TextField(css_select='p.tag>a.red')
    latest_chapter = TextField(css_select='li.update>div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    def tal_cover(self, cover):
        """Return the scraped cover ``src`` prefixed with ``http:``."""
        return 'http:' + cover

    def tal_status(self, status):
        """Collapse the status tags into one ``#``-separated string.

        When the selector matches a single node the value has already been
        extracted; otherwise a list of elements is handed in and their text
        is joined here.

        :param status: extracted string, or a list of matched elements.
        :return: the status text, several tags joined with ``#``.
        """
        if isinstance(status, str):
            # Single match: iterating the string would fail on ``str.text``.
            return status
        return '#'.join([i.text for i in status])

    def tal_novels_type(self, novels_type):
        """Collapse the type tags into one ``#``-separated string.

        :param novels_type: extracted string, or a list of matched elements.
        :return: the type text, several tags joined with ``#``.
        """
        if isinstance(novels_type, str):
            # Single match: the value is already the text we want.
            return novels_type
        return '#'.join([i.text for i in novels_type])

    def tal_latest_chapter_time(self, latest_chapter_time):
        """Replace the relative day words with concrete local dates.

        ``今天`` becomes today's date and ``昨日`` the date 24 hours earlier,
        both formatted as ``%Y-%m-%d`` followed by a space.

        :param latest_chapter_time: the scraped update-time text.
        :return: the text with both day words substituted.
        """
        now = time.time()
        today = str(time.strftime("%Y-%m-%d ", time.localtime(now)))
        yesterday = str(
            time.strftime("%Y-%m-%d ", time.localtime(now - 24 * 60 * 60)))
        return latest_chapter_time.replace(
            u'今天', today).replace(u'昨日', yesterday)
class QidianItem(Item):
    """Fields describing one novel on a Qidian book page.

    The ``tal_<field>`` methods clean up the value extracted for the field
    of the same name (the hook-up is presumably done by the ``Item`` base
    class, which is not visible in this file section).
    """

    title = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    # A value that lives in an attribute has to be declared as an AttrField.
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    tag = TextField(css_select='span.blue')
    latest_chapter = TextField(css_select='li.update>div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    def tal_title(self, title):
        """Hook for post-processing the title (replace, clean, ...).

        :param title: the extracted title.
        :return: the title, currently unchanged.
        """
        return title

    def tal_cover(self, cover):
        """Return the scraped cover ``src`` prefixed with ``http:``."""
        return 'http:' + cover

    def tal_tag(self, ele_tag):
        """Collapse the tags into one ``#``-separated string.

        When the selector matches a single node the value has already been
        extracted; otherwise a list of elements is handed in and their text
        is joined here.

        :param ele_tag: extracted string, or a list of matched elements.
        :return: the tag text, several tags joined with ``#``.
        """
        if isinstance(ele_tag, str):
            # Single match: iterating the string would fail on ``str.text``.
            return ele_tag
        return '#'.join([i.text for i in ele_tag])

    def tal_latest_chapter_time(self, latest_chapter_time):
        """Replace the relative day words with concrete local dates.

        ``今天`` becomes today's date and ``昨日`` the date 24 hours earlier,
        both formatted as ``%Y-%m-%d`` followed by a space.

        :param latest_chapter_time: the scraped update-time text.
        :return: the text with both day words substituted.
        """
        now = time.time()
        today = str(time.strftime("%Y-%m-%d ", time.localtime(now)))
        yesterday = str(
            time.strftime("%Y-%m-%d ", time.localtime(now - 24 * 60 * 60)))
        return latest_chapter_time.replace(
            u'今天', today).replace(u'昨日', yesterday)
class RankingItem(Item):
    """Fields of one ``.rank-list`` block: title, "more" link, book rows."""

    target_item = TextField(css_select='.rank-list')
    ranking_title = TextField(css_select='h3.wrap-title')
    more = AttrField(css_select='h3>a.more', attr='href')
    book_list = TextField(css_select='div.book-list>ul>li')

    def tal_more(self, more):
        """Return the scraped ``more`` href prefixed with ``http:``."""
        scheme = "http:"
        return scheme + more
class BaiduImgItem(Item):
    """Image sources taken from every ``img.BDE_Image`` node."""

    img_url = AttrField(css_select='img.BDE_Image', attr='src')

    def tal_img_url(self, ele_img_url):
        """Return the stripped ``src`` of each matched image as a list.

        :param ele_img_url: the ``src`` string when a single image matched,
            otherwise a list of matched elements.
        :return: list of stripped ``src`` values.
        """
        if isinstance(ele_img_url, str):
            # Single match: the attribute was already extracted, so wrap it
            # instead of iterating over the characters of the URL.
            return [ele_img_url.strip()]
        return [i.get('src').strip() for i in ele_img_url]
class ZHNovelsItem(Item):
    """Fields of one ``ul.main_con>li`` row: novel link, name and author."""

    target_item = TextField(css_select='ul.main_con>li')
    # NOTE: novel_url has no tal_ hook, so it is kept exactly as scraped.
    novel_url = AttrField(css_select='span.chap>a.fs14', attr='href')
    novel_name = TextField(css_select='span.chap>a.fs14')
    novel_author = TextField(css_select='span.author>a')
    novel_author_home_url = AttrField(css_select='span.author>a', attr='href')

    def tal_novel_author(self, novel_author):
        """Return the author name, or ``''`` when nothing was extracted.

        A list value is reduced to the text of its first element; any other
        truthy value is returned unchanged.
        """
        if not novel_author:
            return ''
        if isinstance(novel_author, list):
            return novel_author[0].text
        return novel_author
class DoubanSpider(Item):
    """Fields of one ``div.item`` entry: title, cover and one-line abstract."""

    target_item = TextField(css_select='div.item')
    title = TextField(css_select='span.title')
    cover = AttrField(css_select='div.pic>a>img', attr='src')
    abstract = TextField(css_select='span.inq')

    def tal_title(self, title):
        """Return the title as one string.

        A string is passed through untouched. Anything else is treated as a
        sequence of elements whose text is stripped, cleared of
        non-breaking spaces and concatenated.
        """
        if not isinstance(title, str):
            pieces = (node.text.strip().replace('\xa0', '') for node in title)
            return ''.join(pieces)
        return title
class QidianNovelsItem(Item):
    """Fields of one ``ul.all-img-list>li`` row: novel link, name, author."""

    target_item = TextField(css_select='ul.all-img-list>li')
    novel_url = AttrField(css_select='div.book-img-box>a', attr='href')
    novel_name = TextField(css_select='div.book-mid-info>h4')
    novel_author = TextField(css_select='div.book-mid-info>p.author>a.name')
    novel_author_home_url = AttrField(
        css_select='div.book-mid-info>p.author>a.name', attr='href')

    def tal_novel_url(self, novel_url):
        """Return the scraped novel href prefixed with ``http:``."""
        scheme = 'http:'
        return scheme + novel_url

    def tal_novel_author(self, novel_author):
        """Return the author; a list is reduced to its first element's text."""
        if not isinstance(novel_author, list):
            return novel_author
        first = novel_author[0]
        return first.text

    def tal_novel_author_home_url(self, novel_author_home_url):
        """Return the author page href prefixed with ``http:``.

        A list value is first reduced to the stripped ``href`` attribute of
        its first element.
        """
        href = novel_author_home_url
        if isinstance(href, list):
            href = href[0].get('href').strip()
        return 'http:' + href
class ZHNovelInfoItem(Item):
    """Fields describing one novel, selected under ``div.main``.

    The ``tal_<field>`` methods clean up the value extracted for the field
    of the same name (the hook-up is presumably done by the ``Item`` base
    class, which is not visible in this file section).
    """

    novel_name = TextField(css_select='div.main div.status h1 a')
    author = TextField(css_select='div.main div.status div.booksub a')
    # A value that lives in an attribute has to be declared as an AttrField.
    cover = AttrField(css_select='div.main div.book_cover img', attr='src')
    abstract = TextField(css_select='div.main div.status div.info_con p')
    status = AttrField(css_select='div.main div.status h1 em', attr='title')
    novels_type = TextField(css_select='div.main div.status div.booksub a')
    novel_chapter_url = AttrField(
        css_select='div.main div.status div.book_btn span.list a',
        attr='href')

    def tal_author(self, author):
        """Return the author; a list is reduced to its first element's text.

        :param author: extracted string, or a list of matched elements.
        :return: the author value.
        """
        if isinstance(author, list):
            return author[0].text
        return author

    def tal_status(self, status):
        """Collapse the status titles into one ``#``-separated string.

        When the selector matches a single node the value has already been
        extracted; otherwise a list of elements is handed in. For a list,
        each element's ``title`` attribute is stripped and has ``作品``
        removed before joining.

        :param status: extracted string, or a list of matched elements.
        :return: the status text.
        """
        if isinstance(status, list):
            return '#'.join(
                [i.get('title').strip().replace('作品', '') for i in status])
        return status

    def tal_novels_type(self, novels_type):
        """Return the text of the second matched link, or ``''``.

        ``author`` and ``novels_type`` share one selector; this hook reads
        index 1 of the match list while ``tal_author`` reads index 0.

        :param novels_type: extracted value; only a list is used.
        :return: text of element 1, or ``''`` when it is unavailable.
        """
        if not isinstance(novels_type, list):
            return ''
        try:
            return novels_type[1].text
        except (IndexError, AttributeError):
            # Fewer than two matches, or an entry without ``.text``: stay
            # best-effort, but no longer swallow KeyboardInterrupt and
            # SystemExit as the former bare ``except`` did.
            return ''
class TestSpider(Item):
    """Fields describing one novel on a Qidian-style book page.

    The ``tal_<field>`` methods clean up the value extracted for the field
    of the same name (the hook-up is presumably done by the ``Item`` base
    class, which is not visible in this file section).
    """

    title = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    tag = TextField(css_select='span.blue')
    latest_chapter = TextField(css_select='div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    def tal_title(self, title):
        """Hook for cleaning the title; currently returns it unchanged."""
        return title

    def tal_cover(self, cover):
        """Return the scraped cover ``src`` prefixed with ``http:``."""
        return 'http:' + cover

    def tal_tag(self, ele_tag):
        """Collapse the tags into one ``#``-separated string.

        :param ele_tag: extracted string (single match), or a list of
            matched elements.
        :return: the tag text, several tags joined with ``#``.
        """
        if isinstance(ele_tag, str):
            # Single match: iterating the string would fail on ``str.text``.
            return ele_tag
        return '#'.join([i.text for i in ele_tag])

    def tal_latest_chapter_time(self, latest_chapter_time):
        """Replace the relative day words with concrete local dates.

        ``今天`` becomes today's date and ``昨日`` the date 24 hours earlier,
        both formatted as ``%Y-%m-%d`` followed by a space. ``昨日`` is
        handled to match the other Qidian items that read this element.

        :param latest_chapter_time: the scraped update-time text.
        :return: the text with both day words substituted.
        """
        now = time.time()
        today = str(time.strftime("%Y-%m-%d ", time.localtime(now)))
        yesterday = str(
            time.strftime("%Y-%m-%d ", time.localtime(now - 24 * 60 * 60)))
        return latest_chapter_time.replace(
            '今天', today).replace('昨日', yesterday)
def test_attr_field(self):
    """AttrField yields the ``href`` of the ``p a.test_link`` node."""
    expected = "https://github.com/howie6879/talonspider"
    field = AttrField(css_select="p a.test_link", attr='href')
    self.assertEqual(field.extract_value(self.html), expected)