コード例 #1
0
class RankingItem(Item):
    """Item describing one ranking block, with fields located by CSS selector.

    ``more`` reads the ``href`` attribute of the ``a.more`` link; the other
    fields are declared as text fields.
    """

    # NOTE(review): target_item presumably marks the node each item is built
    # from -- confirm against the Item base class.
    target_item = TextField(css_select='.rank-list')
    ranking_title = TextField(css_select='h3.wrap-title')
    more = AttrField(css_select='h3>a.more', attr='href')
    book_list = TextField(css_select='div.book-list>ul>li')

    def tal_more(self, more):
        """Return ``more`` with the ``http:`` scheme prepended.

        NOTE(review): presumably called by the Item machinery as the cleaning
        hook for the ``more`` field -- confirm.
        """
        scheme = "http:"
        return scheme + more
コード例 #2
0
ファイル: test_item.py プロジェクト: zhuyoucai168/talospider
class DoubanItemsSpider(Item):
    """Item for one ``div.item`` entry: title, cover image URL and abstract."""

    target_item = TextField(css_select='div.item')
    title = TextField(css_select='span.title')
    # The cover is the ``src`` attribute of the image, not its text.
    cover = AttrField(css_select='div.pic>a>img', attr='src')
    abstract = TextField(css_select='span.inq')

    def tal_title(self, title):
        """Collapse ``title`` into a single string.

        A ``str`` is returned unchanged. Any other value is iterated: each
        element's ``.text`` is stripped, has non-breaking spaces removed, and
        the pieces are concatenated without a separator.
        """
        if not isinstance(title, str):
            pieces = (
                node.text.strip().replace('\xa0', '') for node in title
            )
            return ''.join(pieces)
        return title
コード例 #3
0
class RankingItem(Item):
    """Item describing one ranking block, with fields located by CSS selector.

    ``ranking_title`` is declared as an ``AttrField`` with ``attr='html'`` and
    post-processed by ``tal_ranking_title``; ``more`` reads the ``href`` of
    the ``a.more`` link.
    """

    target_item = TextField(css_select='.rank-list')
    ranking_title = AttrField(css_select='h3.wrap-title', attr='html')
    more = AttrField(css_select='h3>a.more', attr='href')
    book_list = TextField(css_select='div.book-list>ul>li')

    def tal_ranking_title(self, ranking_title):
        """Return the text of the first matched title node.

        :param ranking_title: the raw extracted value; a list of nodes, or an
            already-extracted scalar.
        :return: ``.text`` of the first node for a non-empty list, ``''`` for
            an empty list, and the value itself when it is not a list.
        """
        if not isinstance(ranking_title, list):
            # Pass scalars through instead of silently returning None.
            return ranking_title
        if not ranking_title:
            # Nothing matched: avoid IndexError on the [0] lookup below.
            return ''
        return ranking_title[0].text

    def tal_more(self, more):
        """Return ``more`` with the ``http:`` scheme prepended."""
        return "http:" + more
コード例 #4
0
ファイル: zongheng_all_novels.py プロジェクト: zys58/owllook
class ZHNovelsItem(Item):
    """Item for one ``ul.main_con>li`` entry: novel link/name and author."""

    target_item = TextField(css_select='ul.main_con>li')
    novel_url = AttrField(css_select='span.chap>a.fs14', attr='href')
    novel_name = TextField(css_select='span.chap>a.fs14')
    novel_author = TextField(css_select='span.author>a')
    novel_author_home_url = AttrField(css_select='span.author>a', attr='href')

    # NOTE: novel_url is used exactly as extracted; a former tal_novel_url
    # hook that prepended 'http:' is disabled.

    def tal_novel_author(self, novel_author):
        """Normalise the author value to a string.

        Falsy input yields ``''``; a non-empty list yields the ``.text`` of
        its first element; anything else is returned unchanged.
        """
        if not novel_author:
            return ''
        if isinstance(novel_author, list):
            return novel_author[0].text
        return novel_author
コード例 #5
0
class QidianNovelInfoItem(Item):
    """
    Item subclass declaring the fields extracted for a single novel.
    """
    novel_name = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    # When the wanted value is an attribute, declare an AttrField.
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    status = TextField(css_select='p.tag>span.blue')
    novels_type = TextField(css_select='p.tag>a.red')
    latest_chapter = TextField(css_select='li.update>div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    def tal_cover(self, cover):
        """Return ``cover`` with the ``http:`` scheme prepended."""
        return 'http:' + cover

    def tal_status(self, status):
        """
        Join the text of every status node with ``#``.

        When only one node matches, the value arrives already extracted as a
        string and is returned as is; otherwise a list arrives and is looped
        over here.

        :param status: an extracted string, or an iterable of nodes
        :return: the status text(s) as one string
        """
        if isinstance(status, str):
            # Iterating a str would yield characters, which have no .text.
            return status
        return '#'.join([i.text for i in status])

    def tal_novels_type(self, novels_type):
        """
        Join the text of every type node with ``#``.

        :param novels_type: an extracted string, or an iterable of nodes
        :return: the type text(s) as one string
        """
        if isinstance(novels_type, str):
            # Same single-match case as tal_status.
            return novels_type
        return '#'.join([i.text for i in novels_type])

    def tal_latest_chapter_time(self, latest_chapter_time):
        """
        Replace the relative day words in the timestamp with local dates.

        '今天' becomes the current local date and '昨日' the local date 24 hours
        ago, each formatted as ``%Y-%m-%d`` followed by a space.

        :param latest_chapter_time: the extracted time string
        :return: the string with both words substituted
        """
        day_format = "%Y-%m-%d "
        today = time.strftime(day_format, time.localtime())
        yesterday = time.strftime(
            day_format, time.localtime(time.time() - 24 * 60 * 60))
        return latest_chapter_time.replace(
            '今天', today).replace('昨日', yesterday)
コード例 #6
0
class QidianNovelsItem(Item):
    """Item for one ``ul.all-img-list>li`` entry: novel link, name, author."""

    target_item = TextField(css_select='ul.all-img-list>li')
    novel_url = AttrField(css_select='div.book-img-box>a', attr='href')
    novel_name = TextField(css_select='div.book-mid-info>h4')
    novel_author = TextField(css_select='div.book-mid-info>p.author>a.name')
    # Same anchor as novel_author, but reading its href instead of its text.
    novel_author_home_url = AttrField(
        css_select='div.book-mid-info>p.author>a.name', attr='href')

    def tal_novel_url(self, novel_url):
        """Return ``novel_url`` with the ``https:`` scheme prepended."""
        scheme = 'https:'
        return scheme + novel_url

    def tal_novel_author(self, novel_author):
        """Return the first element's ``.text`` for a list, else the value."""
        if not isinstance(novel_author, list):
            return novel_author
        return novel_author[0].text

    def tal_novel_author_home_url(self, novel_author_home_url):
        """Return the author's home URL with the ``http:`` scheme prepended.

        When a list is received, the stripped ``href`` of its first element
        is used as the URL.
        """
        home_url = novel_author_home_url
        if isinstance(home_url, list):
            first = home_url[0]
            home_url = first.get('href').strip()
        return 'http:' + home_url
コード例 #7
0
class ZHNovelInfoItem(Item):
    """
    Item subclass declaring the fields extracted for a single novel.
    """
    novel_name = TextField(css_select='div.main div.status h1 a')
    author = TextField(css_select='div.main div.status div.booksub a')
    # When the wanted value is an attribute, declare an AttrField.
    cover = AttrField(css_select='div.main div.book_cover img', attr='src')
    abstract = TextField(css_select='div.main div.status div.info_con p')
    status = AttrField(css_select='div.main div.status h1 em', attr='title')
    # Same selector as ``author``: the hooks below pick element [0] as the
    # author and element [1] as the type.
    novels_type = TextField(css_select='div.main div.status div.booksub a')
    novel_chapter_url = AttrField(
        css_select='div.main div.status div.book_btn span.list a', attr='href')

    def tal_author(self, author):
        """Return the first element's ``.text`` for a list, else the value."""
        if isinstance(author, list):
            return author[0].text
        else:
            return author

    def tal_status(self, status):
        """
        Normalise the status value to one string.

        When only one node matches, the value arrives already extracted and
        is returned as is; otherwise a list arrives, and each node's ``title``
        attribute is stripped, has '作品' removed, and is joined with ``#``.

        :param status: an extracted value, or a list of nodes
        :return: the status text(s) as one string
        """
        if isinstance(status, list):
            return '#'.join(
                [i.get('title').strip().replace('作品', '') for i in status])
        else:
            return status

    def tal_novels_type(self, novels_type):
        """
        Return the text of the second matched node, or ``''``.

        :param novels_type: the raw extracted value
        :return: ``novels_type[1].text`` when a list with a usable second
            element is received; ``''`` otherwise
        """
        if not isinstance(novels_type, list):
            return ''
        try:
            return novels_type[1].text
        except (IndexError, AttributeError):
            # Fewer than two matches, or an element without .text: keep the
            # best-effort fallback without swallowing unrelated exceptions.
            return ''
コード例 #8
0
ファイル: talospider_test.py プロジェクト: zys58/owllook
class TestSpider(Item):
    """Item declaring the fields extracted for a single novel page."""

    title = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    # The cover is the ``src`` attribute of the image, not its text.
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    tag = TextField(css_select='span.blue')
    latest_chapter = TextField(css_select='li.update>div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    def tal_title(self, title):
        """Return ``title`` unchanged (placeholder for cleaning the value)."""
        return title

    def tal_cover(self, cover):
        """Return ``cover`` with the ``http:`` scheme prepended."""
        return 'http:' + cover

    def tal_tag(self, ele_tag):
        """Join the text of every tag node with ``#``.

        :param ele_tag: an already-extracted string, or an iterable of nodes
        :return: the string itself, or the nodes' ``.text`` joined with ``#``
        """
        if isinstance(ele_tag, str):
            # Iterating a str would yield characters, which have no .text.
            return ele_tag
        return '#'.join([i.text for i in ele_tag])

    def tal_latest_chapter_time(self, latest_chapter_time):
        """Replace '今天' in the timestamp with the current local date.

        The date is formatted as ``%Y-%m-%d`` followed by a space.
        """
        today = time.strftime("%Y-%m-%d ", time.localtime())
        return latest_chapter_time.replace('今天', today)
コード例 #9
0
class NameItem(Item):
    """Item declaring two text fields, each located by a CSS selector."""
    # Text of anchors that are direct children of an <h4>.
    top_name = TextField(css_select='h4>a')
    # Text of anchors carrying the ``name`` class.
    other_name = TextField(css_select='a.name')
コード例 #10
0
ファイル: qidian_honor_spider.py プロジェクト: zys58/owllook
class QidianHonorItem(Item):
    """Item declaring the text fields of one honor entry."""
    # NOTE(review): target_item presumably marks the repeating ``li.cf`` node
    # each item is built from -- confirm against the Item base class.
    target_item = TextField(css_select='li.cf')
    # Text of ``span.decs``.
    honor_text = TextField(css_select='span.decs')
    # Text of ``span.time``.
    honor_time = TextField(css_select='span.time')
コード例 #11
0
ファイル: test_fields.py プロジェクト: JXtreehouse/talospider
 def test_xpath_select(self):
     """Extracting ``/html/head/title`` by XPath yields ``"talonspider"``."""
     title_field = TextField(xpath_select='/html/head/title')
     self.assertEqual(title_field.extract_value(self.html), "talonspider")
コード例 #12
0
ファイル: test_fields.py プロジェクト: JXtreehouse/talospider
 def test_css_select(self):
     """Extracting ``head title`` by CSS selector yields ``"talonspider"``."""
     title_field = TextField(css_select="head title")
     self.assertEqual(title_field.extract_value(self.html), "talonspider")
コード例 #13
0
ファイル: test_item.py プロジェクト: zhuyoucai168/talospider
class DoubanItemSpider(Item):
    """Item with a single ``title`` text field selected by ``head title``."""
    title = TextField(css_select='head title')

    def tal_title(self, title):
        """Return ``title`` unchanged."""
        return title