Beispiel #1
0
class QidianNovelInfoItem(Item):
    """
    Item subclass declaring the fields extracted for one novel's info.

    Every class attribute binds a field name to a CSS selector. The
    ``tal_<field>`` methods receive the value extracted for the field of the
    same name and return the cleaned value.
    """
    novel_name = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    # When the wanted value is an element attribute, declare an AttrField
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    status = TextField(css_select='p.tag>span.blue')
    novels_type = TextField(css_select='p.tag>a.red')
    latest_chapter = TextField(css_select='li.update>div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    def tal_cover(self, cover):
        """
        Prefix the extracted ``src`` value with the ``http:`` scheme.

        :param cover: extracted ``src`` attribute value
        :return: ``'http:'`` followed by ``cover``
        """
        return 'http:' + cover

    def tal_status(self, status):
        """
        Join the text of every matched status element with ``'#'``.

        When only one element matches, the value arrives already extracted
        as text; otherwise a list of elements is passed in and their text is
        collected here.

        :param status: extracted text, or a list of matched elements
        :return: the status text, ``'#'``-separated when several matched
        """
        if isinstance(status, str):
            # Single match: nothing to join, and iterating a str would
            # fail on ``.text``.
            return status
        return '#'.join([i.text for i in status])

    def tal_novels_type(self, novels_type):
        """
        Join the text of every matched type element with ``'#'``.

        :param novels_type: extracted text, or a list of matched elements
        :return: the type text, ``'#'``-separated when several matched
        """
        if isinstance(novels_type, str):
            # Single match: already plain text.
            return novels_type
        return '#'.join([i.text for i in novels_type])

    def tal_latest_chapter_time(self, latest_chapter_time):
        """
        Replace relative day markers with concrete local dates.

        ``'今天'`` (today) and ``'昨日'`` (yesterday) are replaced with the
        matching local date formatted as ``'YYYY-MM-DD '`` (trailing space
        included).

        :param latest_chapter_time: extracted update-time text
        :return: the text with the relative markers replaced
        """
        day_format = "%Y-%m-%d "
        today = time.strftime(day_format, time.localtime())
        yesterday = time.strftime(
            day_format, time.localtime(time.time() - 24 * 60 * 60))
        return latest_chapter_time.replace(
            u'今天', today).replace(u'昨日', yesterday)
class QidianItem(Item):
    """
    Item subclass declaring the fields extracted from a book info page.

    Class attributes map field names to CSS selectors; each ``tal_<field>``
    method post-processes the value extracted for that field.
    """
    title = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    # When the wanted value is an element attribute, declare an AttrField
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    tag = TextField(css_select='span.blue')
    latest_chapter = TextField(css_select='li.update>div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    def tal_title(self, title):
        """
        Hook for a second pass over the extracted value (replace, clean...).

        :param title: extracted title
        :return: the title, currently unchanged
        """
        return title

    def tal_cover(self, cover):
        """
        Prefix the extracted ``src`` value with the ``http:`` scheme.

        :param cover: extracted ``src`` attribute value
        :return: ``'http:'`` followed by ``cover``
        """
        return 'http:' + cover

    def tal_tag(self, ele_tag):
        """
        Join the text of every matched tag element with ``'#'``.

        When only one element matches, the value arrives already extracted
        as text; otherwise a list of elements is passed in and their text is
        collected here.

        :param ele_tag: extracted text, or a list of matched elements
        :return: the tag text, ``'#'``-separated when several matched
        """
        if isinstance(ele_tag, str):
            # Single match: already plain text; iterating it would fail
            # on ``.text``.
            return ele_tag
        return '#'.join([i.text for i in ele_tag])

    def tal_latest_chapter_time(self, latest_chapter_time):
        """
        Replace relative day markers with concrete local dates.

        ``'今天'`` (today) and ``'昨日'`` (yesterday) become the matching
        local date formatted as ``'YYYY-MM-DD '`` (trailing space included).

        :param latest_chapter_time: extracted update-time text
        :return: the text with the relative markers replaced
        """
        one_day = 24 * 60 * 60
        today = time.strftime("%Y-%m-%d ", time.localtime())
        yesterday = time.strftime(
            "%Y-%m-%d ", time.localtime(time.time() - one_day))
        dated = latest_chapter_time.replace(u'今天', today)
        return dated.replace(u'昨日', yesterday)
Beispiel #3
0
class RankingItem(Item):
    """
    Item subclass declaring the fields extracted from a ranking list.

    NOTE(review): ``target_item`` presumably selects each repeated block the
    other fields are read from -- confirm against the ``Item`` base class.
    """
    target_item = TextField(css_select='.rank-list')
    ranking_title = TextField(css_select='h3.wrap-title')
    more = AttrField(attr='href', css_select='h3>a.more')
    book_list = TextField(css_select='div.book-list>ul>li')

    def tal_more(self, more):
        """
        Prefix the extracted ``href`` value with the ``http:`` scheme.

        :param more: extracted ``href`` attribute value
        :return: ``'http:'`` followed by ``more``
        """
        scheme = "http:"
        return scheme + more
Beispiel #4
0
class BaiduImgItem(Item):
    """
    Item subclass declaring the image URL field (``img.BDE_Image``).
    """
    img_url = AttrField(css_select='img.BDE_Image', attr='src')

    def tal_img_url(self, ele_img_url):
        """
        Return the stripped ``src`` of every matched image as a list.

        NOTE(review): assumes a single match is delivered as the already
        extracted attribute string (as the list checks in sibling items
        suggest) -- confirm against ``AttrField``.

        :param ele_img_url: attribute string, or a list of matched elements
        :return: list of ``src`` strings with surrounding whitespace removed
        """
        if isinstance(ele_img_url, str):
            # Single match: wrap it so callers always receive a list;
            # iterating the string would call ``.get`` on its characters.
            return [ele_img_url.strip()]
        return [i.get('src').strip() for i in ele_img_url]
class ZHNovelsItem(Item):
    """
    Item subclass declaring the fields of one novel entry in a listing.

    NOTE(review): ``target_item`` presumably selects each repeated entry the
    other fields are read from -- confirm against the ``Item`` base class.
    """
    target_item = TextField(css_select='ul.main_con>li')
    novel_url = AttrField(attr='href', css_select='span.chap>a.fs14')
    novel_name = TextField(css_select='span.chap>a.fs14')
    novel_author = TextField(css_select='span.author>a')
    novel_author_home_url = AttrField(attr='href', css_select='span.author>a')

    # novel_url has no tal_ hook here: the value is kept as extracted.

    def tal_novel_author(self, novel_author):
        """
        Normalise the extracted author to text.

        :param novel_author: extracted text, or a list of matched elements
        :return: ``''`` for an empty value, the first element's text for a
                 list, otherwise the value unchanged
        """
        if not novel_author:
            return ''
        if isinstance(novel_author, list):
            return novel_author[0].text
        return novel_author
Beispiel #6
0
class DoubanSpider(Item):
    """
    Item subclass declaring title, cover and abstract of one ``div.item``.

    NOTE(review): ``target_item`` presumably selects each repeated entry the
    other fields are read from -- confirm against the ``Item`` base class.
    """
    target_item = TextField(css_select='div.item')
    title = TextField(css_select='span.title')
    cover = AttrField(attr='src', css_select='div.pic>a>img')
    abstract = TextField(css_select='span.inq')

    def tal_title(self, title):
        """
        Collapse the extracted title into one string.

        :param title: extracted text, or an iterable of matched elements
        :return: the text unchanged, or the concatenation of every element's
                 stripped text with ``'\\xa0'`` characters removed
        """
        if not isinstance(title, str):
            pieces = []
            for node in title:
                pieces.append(node.text.strip().replace('\xa0', ''))
            title = ''.join(pieces)
        return title
Beispiel #7
0
class QidianNovelsItem(Item):
    """
    Item subclass declaring the fields of one novel entry in a listing.

    NOTE(review): ``target_item`` presumably selects each repeated entry the
    other fields are read from -- confirm against the ``Item`` base class.
    """
    target_item = TextField(css_select='ul.all-img-list>li')
    novel_url = AttrField(attr='href', css_select='div.book-img-box>a')
    novel_name = TextField(css_select='div.book-mid-info>h4')
    novel_author = TextField(css_select='div.book-mid-info>p.author>a.name')
    novel_author_home_url = AttrField(
        attr='href', css_select='div.book-mid-info>p.author>a.name')

    def tal_novel_url(self, novel_url):
        """
        Prefix the extracted ``href`` value with the ``http:`` scheme.

        :param novel_url: extracted ``href`` attribute value
        :return: ``'http:'`` followed by ``novel_url``
        """
        scheme = 'http:'
        return scheme + novel_url

    def tal_novel_author(self, novel_author):
        """
        Reduce a list of matched elements to the first element's text.

        :param novel_author: extracted text, or a list of matched elements
        :return: the first element's text for a list, else the value as is
        """
        if not isinstance(novel_author, list):
            return novel_author
        return novel_author[0].text

    def tal_novel_author_home_url(self, novel_author_home_url):
        """
        Build the author's home URL with an ``http:`` scheme.

        :param novel_author_home_url: extracted ``href`` value, or a list of
                                      matched elements
        :return: ``'http:'`` followed by the (first) stripped ``href``
        """
        href = novel_author_home_url
        if isinstance(href, list):
            first_link = href[0]
            href = first_link.get('href').strip()
        return 'http:' + href
Beispiel #8
0
class ZHNovelInfoItem(Item):
    """
    Item subclass declaring the fields extracted for one novel's info.

    Class attributes map field names to CSS selectors; each ``tal_<field>``
    method post-processes the value extracted for that field.
    """
    novel_name = TextField(css_select='div.main div.status h1 a')
    author = TextField(css_select='div.main div.status div.booksub a')
    # When the wanted value is an element attribute, declare an AttrField
    cover = AttrField(css_select='div.main div.book_cover img', attr='src')
    abstract = TextField(css_select='div.main div.status div.info_con p')
    status = AttrField(css_select='div.main div.status h1 em', attr='title')
    # Same selector as ``author``: the hooks below pick element 0 for the
    # author and element 1 for the type.
    novels_type = TextField(css_select='div.main div.status div.booksub a')
    novel_chapter_url = AttrField(css_select='div.main div.status div.book_btn span.list a', attr='href')

    def tal_author(self, author):
        """
        Reduce a list of matched elements to the first element's text.

        :param author: extracted text, or a list of matched elements
        :return: the first element's text for a list, else the value as is
        """
        if isinstance(author, list):
            return author[0].text
        else:
            return author

    def tal_status(self, status):
        """
        Join the ``title`` attribute of every matched element with ``'#'``.

        When only one element matches, the value arrives already extracted
        and is returned unchanged; otherwise each element's ``title`` is
        stripped and has the substring ``'作品'`` removed before joining.

        :param status: extracted value, or a list of matched elements
        :return: the status string
        """
        if isinstance(status, list):
            return '#'.join([i.get('title').strip().replace('作品', '') for i in status])
        else:
            return status

    def tal_novels_type(self, novels_type):
        """
        Return the text of the second matched element.

        :param novels_type: extracted text, or a list of matched elements
        :return: text of element 1, or ``''`` when the value is not a list
                 or has no usable second element
        """
        if isinstance(novels_type, list):
            try:
                return novels_type[1].text
            except (IndexError, AttributeError):
                # Fewer than two matches (or an entry without ``.text``):
                # fall back to an empty type instead of swallowing every
                # exception with a bare ``except``.
                return ''
        else:
            return ''
Beispiel #9
0
class TestSpider(Item):
    """
    Item subclass declaring the fields extracted from a book info page.

    Class attributes map field names to CSS selectors; each ``tal_<field>``
    method post-processes the value extracted for that field.
    """
    title = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    tag = TextField(css_select='span.blue')
    latest_chapter = TextField(css_select='div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    def tal_title(self, title):
        """
        Hook for cleaning the extracted title; currently a pass-through.

        :param title: extracted title
        :return: the title unchanged
        """
        return title

    def tal_cover(self, cover):
        """
        Prefix the extracted ``src`` value with the ``http:`` scheme.

        :param cover: extracted ``src`` attribute value
        :return: ``'http:'`` followed by ``cover``
        """
        return 'http:' + cover

    def tal_tag(self, ele_tag):
        """
        Join the text of every matched tag element with ``'#'``.

        NOTE(review): assumes a single match is delivered as already
        extracted text rather than a list -- confirm against ``TextField``.

        :param ele_tag: extracted text, or a list of matched elements
        :return: the tag text, ``'#'``-separated when several matched
        """
        if isinstance(ele_tag, str):
            # Single match: already plain text; iterating it would fail
            # on ``.text``.
            return ele_tag
        return '#'.join([i.text for i in ele_tag])

    def tal_latest_chapter_time(self, latest_chapter_time):
        """
        Replace the relative marker ``'今天'`` (today) with the local date.

        :param latest_chapter_time: extracted update-time text
        :return: the text with ``'今天'`` replaced by ``'YYYY-MM-DD '``
        """
        today = time.strftime("%Y-%m-%d ", time.localtime())
        return latest_chapter_time.replace('今天', today)
Beispiel #10
0
 def test_attr_field(self):
     # An AttrField on 'p a.test_link' must yield that link's href.
     expected_href = "https://github.com/howie6879/talonspider"
     link_field = AttrField(attr='href', css_select="p a.test_link")
     self.assertEqual(link_field.extract_value(self.html), expected_href)