Beispiel #1
0
class QidianNovelsItem(Item):
    """Declare the fields of one novel entry and the hooks that tidy them.

    Every field is located with a CSS selector.  The ``clean_<field>``
    coroutines post-process the matching field's raw value; they are
    presumably invoked by the ``Item`` base class by naming convention
    (confirm against the framework in use).
    """

    target_item = TextField(css_select="ul.all-img-list>li")
    novel_url = AttrField(attr="href", css_select="div.book-img-box>a")
    novel_name = TextField(css_select="div.book-mid-info>h4")
    novel_author = TextField(css_select="div.book-mid-info>p.author>a.name")
    novel_author_home_url = AttrField(
        attr="href",
        css_select="div.book-mid-info>p.author>a.name",
    )
    novel_type = TextField(
        css_select="div.book-mid-info > p.author > a:nth-child(4)",
    )
    novel_cover = AttrField(attr="src", css_select="div.book-img-box img")
    novel_abstract = TextField(css_select="div.book-mid-info p.intro")

    # NOTE: a ``novel_latest_chapter`` text field (selector
    # 'div.bookupdate a') used to be declared here and is disabled.

    async def clean_novel_url(self, novel_url):
        """Return the novel link with the ``https:`` scheme prepended."""
        scheme = "https:"
        return scheme + novel_url

    async def clean_novel_author(self, novel_author):
        """Return the author name; for a list, use the first element's text."""
        if not isinstance(novel_author, list):
            return novel_author
        first_match = novel_author[0]
        return first_match.text

    async def clean_novel_author_home_url(self, novel_author_home_url):
        """Return the author's home link with ``https:`` prepended.

        A list input is reduced to the stripped ``href`` of its first element.
        """
        link = novel_author_home_url
        if isinstance(link, list):
            first_anchor = link[0]
            link = first_anchor.get("href").strip()
        return "https:" + link

    async def clean_novel_cover(self, novel_cover):
        """Return the cover image link with the ``https:`` scheme prepended."""
        scheme = "https:"
        return scheme + novel_cover
Beispiel #2
0
class FictionItem(Item):
    """Declare a title and a link, both read from an ``a`` element.

    ``target_item`` uses the ``dd`` selector; presumably it marks the
    repeated container each entry is read from (confirm with the framework).
    """

    target_item = TextField(css_select='dd')
    title = TextField(css_select="a")
    url = AttrField(attr="href", css_select="a")

    async def clean_title(self, value):
        """Return the extracted title exactly as received."""
        title_text = value
        return title_text
Beispiel #3
0
class HackerNewsItem(Item):
    """Declare the title and link of a story, read from ``a.storylink``.

    ``target_item`` uses the ``tr.athing`` selector; presumably it marks the
    repeated row each story is read from (confirm with the framework).
    """

    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(attr="href", css_select="a.storylink")

    async def clean_title(self, value):
        """Return the extracted title exactly as received."""
        story_title = value
        return story_title
Beispiel #4
0
class MyItem(Item):
    """Declare a title, a star value and a multi-valued tag field.

    ``tags`` is declared with ``many=True``; the other fields are single.
    """

    title = TextField(css_select=".title")
    star = TextField(css_select=".star")
    tags = TextField(many=True, css_select=".tag")

    async def clean_star(self, value):
        """Convert the extracted star value with ``int`` and return it."""
        star_count = int(value)
        return star_count
Beispiel #5
0
class MyItem(Item):
    """Declare a title, a star value and a multi-valued tag field.

    ``tags`` is declared with ``many=True``; the other fields are single.
    """

    title = TextField(css_select='.title')
    star = TextField(css_select='.star')
    tags = TextField(many=True, css_select='.tag')

    async def clean_star(self, value):
        """Convert the extracted star value with ``int`` and return it."""
        as_number = int(value)
        return as_number
Beispiel #6
0
class MyItem(Item):
    """Declare the title and star value of a ``.movie`` entry.

    ``target_item`` uses the ``.movie`` selector; presumably it marks the
    repeated container each entry is read from (confirm with the framework).
    """

    target_item = TextField(css_select='.movie')
    title = TextField(css_select='.title')
    star = TextField(css_select='.star')

    async def clean_star(self, value):
        """Convert the extracted star value with ``int`` and return it."""
        star_count = int(value)
        return star_count
Beispiel #7
0
class MyItem(Item):
    """Declare the title and star value of a ``.movie`` entry.

    ``target_item`` uses the ``.movie`` selector; presumably it marks the
    repeated container each entry is read from (confirm with the framework).
    """

    target_item = TextField(css_select=".movie")
    title = TextField(css_select=".title")
    star = TextField(css_select=".star")

    async def clean_star(self, value):
        """Convert the extracted star value with ``int`` and return it."""
        as_number = int(value)
        return as_number
Beispiel #8
0
class JianshuItem(Item):
    """Declare an author's name and profile link, both read from ``a.name``.

    ``target_item`` uses the ``ul.list>li`` selector; presumably it marks the
    repeated list entry (confirm with the framework).
    """

    target_item = TextField(css_select='ul.list>li')
    author_name = TextField(css_select='a.name')
    author_url = AttrField(css_select='a.name', attr='href')

    async def clean_author_url(self, author_url):
        """Return the author link prefixed with the jianshu.com origin."""
        absolute_url = f"https://www.jianshu.com{author_url}"
        return absolute_url
class ArticleListItem(Item):
    """
    Declare the title and link of one article in a list page.

    Example page: http://www.ruanyifeng.com/blog/essays/
    """
    # Selector for the list entries inside div#alpha-inner (presumably the
    # repeated container each item is read from -- confirm with the framework).
    target_item = TextField(css_select='div#alpha-inner li.module-list-item')
    # Title and link both come from the anchor directly under the list entry.
    title = TextField(css_select='li.module-list-item>a')
    href = AttrField(css_select='li.module-list-item>a', attr='href')
Beispiel #10
0
class HackerNewsItem(Item):
    """Declare the title and link of a story, read from ``a.storylink``.

    ``target_item`` uses the ``tr.athing`` selector; presumably it marks the
    repeated row each story is read from (confirm with the framework).
    """

    target_item = TextField(css_select='tr.athing')
    title = TextField(css_select='a.storylink')
    url = AttrField(attr='href', css_select='a.storylink')

    async def clean_title(self, value):
        """Return the title with leading and trailing whitespace removed."""
        trimmed = value.strip()
        return trimmed
Beispiel #11
0
class FishItem(Item):
    """Declare the title, date and link of one search-result entry.

    ``target_item`` uses the ``div.m_search_list dl`` selector; presumably it
    marks the repeated result container (confirm with the framework).
    """

    target_item = TextField(css_select="div.m_search_list dl")
    title = TextField(css_select="h2 a")
    date = TextField(css_select="dd.search_laiyuan")
    url = AttrField(attr="href", css_select="h2 a")

    async def clean_date(self, value):
        """Return the date text with every publish-time label removed."""
        # The literal is the Chinese label for "publish time" followed by a
        # full-width colon; every occurrence is dropped.
        return value.replace('发布时间:', '')
Beispiel #12
0
class Data258WechatItem(Item):
    """
    Extract first-level page information from a data258 WeChat
    official-account search.

    Example: https://mp.data258.com/mp/search?type=category&key=老胡的储物柜&sort=
    """

    # Selector for one result panel (presumably the repeated container each
    # item is read from -- confirm with the framework).
    target_item = TextField(css_select="div.layui-panel")
    # Account name and link both come from the anchor under h2; both fields
    # declare an empty-string default.
    wechat_name = TextField(css_select="h2>a", default="")
    # NOTE(review): "wehcat" looks like a typo of "wechat"; renaming would
    # change the public field name, so it is left as is.
    wehcat_href = AttrField(css_select="h2>a", attr="href", default="")
Beispiel #13
0
class MyItem(Item):
    """Declare the title and star value of a ``.movie`` entry.

    Entries whose title is empty are rejected by raising ``IgnoreThisItem``.
    """

    target_item = TextField(css_select=".movie")
    title = TextField(css_select='.title')
    star = TextField(css_select='.star')

    @staticmethod
    async def clean_title(value):
        """Return a truthy title unchanged; raise ``IgnoreThisItem`` otherwise."""
        if value:
            return value
        raise IgnoreThisItem
Beispiel #14
0
class FishItem(Item):
    """Declare the title, date and link of one entry.

    ``target_item`` uses the ``td [width="530"]`` selector; presumably it
    marks the repeated container (confirm with the framework).
    """

    target_item = TextField(css_select='td [width="530"]')
    title = TextField(css_select="a")
    date = TextField(css_select="td")
    url = AttrField(attr="href", css_select="a")

    async def clean_date(self, value):
        """Return the last ten characters of the text with dots as dashes."""
        # Presumably the cell text ends in a YYYY.MM.DD date -- verify
        # against the target page.
        tail = value[-10:]
        return tail.replace('.', '-')
Beispiel #15
0
class DoubanItem(Item):
    """Declare the title, cover image and abstract of one ``div.item`` entry.

    ``abstract`` declares an empty-string default.
    """

    target_item = TextField(css_select='div.item')
    title = TextField(css_select='span.title')
    cover = AttrField(attr='src', css_select='div.pic>a>img')
    abstract = TextField(default='', css_select='span.inq')

    async def clean_title(self, title):
        """Return a string title unchanged, or join the texts of its elements.

        For a non-string input, each element's ``text`` is stripped and has
        its non-breaking spaces removed before the parts are concatenated.
        """
        if not isinstance(title, str):
            pieces = (node.text.strip().replace("\xa0", "") for node in title)
            return "".join(pieces)
        return title
Beispiel #16
0
class DoubanItems(Item):
    """Declare the title, cover image and abstract of one ``div.item`` entry."""

    target_item = TextField(css_select="div.item")
    title = TextField(css_select="span.title")
    cover = AttrField(attr="src", css_select="div.pic>a>img")
    abstract = TextField(css_select="span.inq")

    async def clean_title(self, title):
        """Return a string title unchanged, or join the texts of its elements.

        For a non-string input, each element's ``text`` is stripped and has
        its non-breaking spaces removed before the parts are concatenated.
        """
        if isinstance(title, str):
            return title
        cleaned_parts = []
        for element in title:
            cleaned_parts.append(element.text.strip().replace('\xa0', ''))
        return ''.join(cleaned_parts)
Beispiel #17
0
class FishItem(Item):
    """Declare the title, date and link of one entry under ``#info``.

    ``target_item`` presumably marks the container each entry is read from
    (confirm with the framework).
    """

    target_item = TextField(css_select='#info ')
    title = TextField(css_select='h2 a')
    date = TextField(css_select='dd.search_laiyuan')
    url = AttrField(css_select='h2 a', attr='href')

    async def clean_date(self, value):
        """Return the text found after the first ``(`` in *value*.

        The segment between the first and second ``(`` is taken and any
        trailing ``)`` characters are removed.  When *value* contains no
        ``(`` it is returned unchanged instead of raising ``IndexError``.
        """
        segments = value.split('(')
        if len(segments) < 2:
            # No parenthesised part to extract: keep the raw text.
            return value
        return segments[1].rstrip(')')
Beispiel #18
0
class DiseaseHomeItem(Item):
    """Declare a disease's name, subject and its list of question links.

    The two ``disease_ask_*`` fields share one selector and are declared
    with ``many=True``.
    """

    disease_name = TextField(css_select="h1.ti")
    disease_subject = TextField(css_select="div.table>div:nth-child(1)>span")
    disease_ask_lists = TextField(
        css_select="div.ask_lists div.lists a",
        many=True,
    )
    disease_ask_link_lists = AttrField(
        css_select="div.ask_lists div.lists a",
        attr="href",
        many=True,
    )

    async def clean_disease_ask_link_lists(self, disease_ask_link_lists):
        """Return a new list with ``http:`` prepended to every link."""
        prefixed_links = []
        for link in disease_ask_link_lists:
            prefixed_links.append('http:' + link)
        return prefixed_links
Beispiel #19
0
class RankingItem(Item):
    """Declare the title, "more" link and book list of one ``.rank-list`` block.

    ``target_item`` presumably marks the repeated container each ranking is
    read from (confirm with the framework).
    """

    target_item = TextField(css_select='.rank-list')
    ranking_title = AttrField(css_select='h3.wrap-title', attr='html')
    more = AttrField(css_select='h3>a.more', attr='href')
    book_list = TextField(css_select='div.book-list>ul>li')

    async def clean_ranking_title(self, ranking_title):
        """Return the ranking title.

        A list input is reduced to the ``text`` of its first element.  Any
        other input is returned unchanged; previously the method fell off
        the end and returned ``None``, discarding the extracted value.
        """
        if isinstance(ranking_title, list):
            return ranking_title[0].text
        return ranking_title

    async def clean_more(self, more):
        """Return the "more" link with the ``http:`` scheme prepended."""
        return "http:" + more
Beispiel #20
0
class DoubanItem(Item):
    """
    Define the target fields of the crawler.
    """
    target_item = TextField(css_select="div.item")
    title = TextField(css_select="span.title")

    async def clean_title(self, title):
        """Return a string title unchanged, or join the texts of its elements.

        For a non-string input, each element's ``text`` is stripped and has
        its non-breaking spaces removed before the parts are concatenated.
        """
        if not isinstance(title, str):
            return "".join(
                part.text.strip().replace("\xa0", "") for part in title
            )
        return title
Beispiel #21
0
class PageItem(Item):
    """Declare the total page count read from a ``ul.be-pager`` pager."""

    target_item = TextField(css_select='ul.be-pager')
    count = TextField(css_select='span.be-pager-total')

    async def clean_count(self, value):
        """Return the first integer found in *value*.

        Only a run of digits is captured.  The earlier pattern also allowed
        a decimal point (e.g. ``"12."`` or ``"12.5"``), which made ``int()``
        fail on text that did contain a usable number.

        Raises:
            ValueError: if *value* contains no digits.  ``ValueError`` is a
                subclass of ``Exception``, so existing handlers still match.
        """
        found = re.search(r"\d+", value)
        if found is None:
            raise ValueError("Error:PageItem re.findall -> pageInfo.count")
        return int(found.group())
Beispiel #22
0
class RankingItem(Item):
    """Declare the title, "more" link and book list of one ``.rank-list`` block.

    ``book_list`` is an HTML field declared with ``many=True``.
    """

    target_item = TextField(css_select=".rank-list")
    ranking_title = TextField(css_select="h3.wrap-title")
    more = AttrField(attr="href", css_select="h3>a.more")
    book_list = HtmlField(many=True, css_select="div.book-list>ul>li")

    async def clean_ranking_title(self, ranking_title):
        """Return the ranking title cut off after its first '榜' character.

        A list input yields the ``text`` of its first element.  Any other
        input is stringified, truncated at the first '榜', and has '榜'
        appended again.
        """
        if not isinstance(ranking_title, list):
            head = str(ranking_title).split('榜')[0]
            return head + '榜'
        return ranking_title[0].text

    async def clean_more(self, more):
        """Return the "more" link with the ``https:`` scheme prepended."""
        scheme = "https:"
        return scheme + more
Beispiel #23
0
class Data258WechatListItem(Item):
    """
    Extract the historical article entries of a data258 WeChat
    official account.

    Example: https://mp.data258.com/article/category/howie_locker
    """

    target_item = TextField(css_select='ul.jie-row>li')
    w_article_title = TextField(default='', css_select='a.jie-title')
    w_article_href = AttrField(
        default='',
        attr='href',
        css_select='a.jie-title',
    )

    async def clean_w_article_title(self, value: list):
        """Return the article title stringified and stripped.

        A falsy *value* yields the empty string.
        """
        if not value:
            return ""
        return str(value).strip()
Beispiel #24
0
class HackerNewsItem(Item):
    """
    Define the extraction rules for the target fields.
    """
    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(attr="href", css_select="a.storylink")

    async def clean_title(self, value):
        """
        Clean the extracted title.

        :param value: raw extracted value
        :return: the value converted with ``str`` and stripped of
            surrounding whitespace
        """
        as_text = str(value)
        return as_text.strip()
Beispiel #25
0
class MyItem(Item):
    """
    Define the target fields of the crawler.
    """
    # Selector for the whole document (presumably the container the other
    # fields are read from -- confirm with the framework).
    target_item = HtmlField(css_select='html')
    # Page title from the <title> element inside <head>.
    title = TextField(css_select='head title')
    # HTML field for the <article> element.
    article = HtmlField(css_select='article')
Beispiel #26
0
class FishItem(Item):
    """Declare the title, date and link of one ``div.jsearch-result-box`` entry."""

    target_item = TextField(css_select="div.jsearch-result-box")
    title = TextField(css_select="div.jsearch-result-title")
    date = TextField(css_select="span.jsearch-result-date")
    url = AttrField(attr="href", css_select="div.jsearch-result-title a")

    async def clean_date(self, value):
        """Return the date text with year/month markers turned into dashes.

        Trailing dashes, then trailing spaces, then trailing '日' characters
        are removed (in that order) before '年' and '月' are replaced by '-'.
        """
        trimmed = value.rstrip('-').rstrip(' ').rstrip('日')
        return trimmed.replace('年', '-').replace('月', '-')

    async def clean_url(self, value):
        """Return the link prefixed with the www.cast.org.cn origin."""
        return 'http://www.cast.org.cn' + value
Beispiel #27
0
class UKSearchItem(Item):
    # Declares the title, link and image of one search-result card.
    # target_item's selector picks the cover blocks directly under the
    # PhoneSokuThreeProgram_4 section (presumably the repeated container --
    # confirm with the framework).
    target_item = TextField(
        css_select=
        "div[data-spm=PhoneSokuThreeProgram_4] > div.pack-cover_1K0Xq")
    # All three fields read a different attribute of the same anchor.
    title = AttrField(css_select='a.pack-top_2nSnm', attr='data-trackinfo')
    url = AttrField(css_select='a.pack-top_2nSnm', attr='href')
    # The image is taken from the anchor's inline ``style`` attribute.
    img = AttrField(css_select='a.pack-top_2nSnm', attr='style')
Beispiel #28
0
class ArchivesItem(Item):
    """
    Declare the link of one entry in an archives page.

    Example page: http://www.ruanyifeng.com/blog/archives.html
    """

    # Selector for the list entries inside div#beta-inner (presumably the
    # repeated container each item is read from -- confirm with the framework).
    target_item = TextField(css_select="div#beta-inner li.module-list-item")
    # Link taken from the anchor directly under the list entry.
    href = AttrField(css_select="li.module-list-item>a", attr="href")
Beispiel #29
0
class QidianNovelInfoItem(Item):
    """
    Item subclass declaring the detail fields of a single novel page.
    """
    novel_name = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    # Use AttrField when the value to extract is an attribute.
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    status = TextField(css_select='p.tag>span.blue')
    novels_type = TextField(css_select='p.tag>a.red')
    latest_chapter = TextField(css_select='li.update>div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    async def clean_cover(self, cover):
        """Return the cover image link with the ``http:`` scheme prepended."""
        return 'http:' + cover

    async def clean_status(self, status):
        """
        Join the texts of all matched status elements with ``#``.

        A single match may arrive already extracted as a plain string; it is
        returned unchanged instead of being iterated character by character
        (which failed on ``.text``).

        :param status: a string, or an iterable of elements exposing ``text``
        :return: the status string
        """
        if isinstance(status, str):
            return status
        return '#'.join(i.text for i in status)

    async def clean_novels_type(self, novels_type):
        """
        Join the texts of all matched type elements with ``#``.

        :param novels_type: a string (returned unchanged), or an iterable of
            elements exposing ``text``
        :return: the novel type string
        """
        if isinstance(novels_type, str):
            return novels_type
        return '#'.join(i.text for i in novels_type)

    async def clean_latest_chapter_time(self, latest_chapter_time):
        """
        Replace the relative day words in the update time with dates.

        '今天' (today) and '昨日' (yesterday) are replaced by the local date
        formatted as ``"%Y-%m-%d "`` (note the trailing space).

        :param latest_chapter_time: raw update-time text
        :return: the text with relative day words expanded
        """
        # Read the clock once so both dates derive from the same instant.
        now = time.time()
        today = time.strftime("%Y-%m-%d ", time.localtime(now))
        yesterday = time.strftime("%Y-%m-%d ",
                                  time.localtime(now - 24 * 60 * 60))
        return latest_chapter_time.replace('今天', today).replace(
            '昨日', yesterday)
Beispiel #30
0
class DiseaseItem(Item):
    """Declare the names and links of all diseases in ``div.keshi_list``.

    Both fields share one selector and are declared with ``many=True``.
    """

    disease_name = TextField(many=True, css_select="div.keshi_list>a")
    disease_url = AttrField(
        many=True,
        attr="href",
        css_select="div.keshi_list>a",
    )

    async def clean_disease_url(self, disease_url):
        """Return a new list with every link resolved against the site origin."""
        base = 'https://m.120ask.com'
        absolute_links = []
        for link in disease_url:
            absolute_links.append(urljoin(base, link))
        return absolute_links