Ejemplo n.º 1
0
def test_attr_field_with_default_and_many():
    """Check that extraction with ``default`` and ``many=True`` yields the default in a list."""
    field_options = {
        "css_select": "div.brand b",
        "attr": "href",
        "default": "nothing",
        "many": True,
    }
    extracted = AttrField(**field_options).extract(html_etree=html_etree)
    assert extracted == ["nothing"]
Ejemplo n.º 2
0
class UKSearchItem(Item):
    """Field declarations for a search-result entry: title, url and image."""

    target_item = TextField(
        css_select="div[data-spm=PhoneSokuThreeProgram_4] > div.pack-cover_1K0Xq"
    )
    # All three attributes below are read from the same anchor selector.
    title = AttrField(attr="data-trackinfo", css_select="a.pack-top_2nSnm")
    url = AttrField(attr="href", css_select="a.pack-top_2nSnm")
    img = AttrField(attr="style", css_select="a.pack-top_2nSnm")
Ejemplo n.º 3
0
class QidianNovelsItem(Item):
    """Field declarations and cleaners for one novel entry in a list page."""

    target_item = TextField(css_select="ul.all-img-list>li")
    novel_url = AttrField(attr="href", css_select="div.book-img-box>a")
    novel_name = TextField(css_select="div.book-mid-info>h4")
    novel_author = TextField(css_select="div.book-mid-info>p.author>a.name")
    novel_author_home_url = AttrField(
        attr="href",
        css_select="div.book-mid-info>p.author>a.name",
    )
    novel_type = TextField(
        css_select="div.book-mid-info > p.author > a:nth-child(4)",
    )
    novel_cover = AttrField(attr="src", css_select="div.book-img-box img")
    novel_abstract = TextField(css_select="div.book-mid-info p.intro")

    # novel_latest_chapter = TextField(css_select='div.bookupdate a')
    async def clean_novel_url(self, novel_url):
        """Prefix the extracted URL with the ``https:`` scheme."""
        scheme = "https:"
        return scheme + novel_url

    async def clean_novel_author(self, novel_author):
        """Return the author; for a list, use the ``.text`` of its first element."""
        if not isinstance(novel_author, list):
            return novel_author
        return novel_author[0].text

    async def clean_novel_author_home_url(self, novel_author_home_url):
        """Return the author's home URL prefixed with ``https:``.

        For a list, the stripped ``href`` of its first element is used.
        """
        link = novel_author_home_url
        if isinstance(link, list):
            first = link[0]
            link = first.get("href").strip()
        return "https:" + link

    async def clean_novel_cover(self, novel_cover):
        """Prefix the cover URL with the ``https:`` scheme."""
        scheme = "https:"
        return scheme + novel_cover
Ejemplo n.º 4
0
class RankingItem(Item):
    """Field declarations and cleaners for one ranking block (``.rank-list``)."""

    target_item = TextField(css_select='.rank-list')
    ranking_title = AttrField(css_select='h3.wrap-title', attr='html')
    more = AttrField(css_select='h3>a.more', attr='href')
    book_list = TextField(css_select='div.book-list>ul>li')

    async def clean_ranking_title(self, ranking_title):
        """
        Normalize the ranking title.

        :param ranking_title: extracted value; either a list of elements or
            an already-extracted value.
        :return: the ``.text`` of the first element for a list, otherwise the
            value unchanged.
        """
        if isinstance(ranking_title, list):
            return ranking_title[0].text
        # Previously this branch fell through and returned None, which
        # discarded any title that was not delivered as a list.
        return ranking_title

    async def clean_more(self, more):
        """
        Prefix the "more" link with the ``http:`` scheme.

        :param more: extracted ``href`` value.
        :return: ``"http:"`` followed by the value.
        """
        return "http:" + more
Ejemplo n.º 5
0
class HYNovelInfoItem(Item):
    """
    Item subclass describing a novel's detail page.

    Most fields are read from the ``content`` attribute of ``<meta>`` tags
    selected by their ``property`` value.
    """
    novel_name = AttrField(css_select="meta[property='og:title']",
                           attr='content')
    author = AttrField(css_select="meta[property='og:novel:author']",
                       attr='content')
    cover = AttrField(css_select="meta[property='og:image']", attr='content')
    abstract = AttrField(css_select="meta[property='og:description']",
                         attr='content')
    status = AttrField(css_select="meta[property='og:novel:status']",
                       attr='content')
    novels_type = AttrField(css_select="meta[property='og:novel:category']",
                            attr='content')
    novel_chapter_url = AttrField(css_select='div#voteList a.index',
                                  attr='href')
    latest_chapter = AttrField(
        css_select="meta[property='og:novel:latest_chapter_name']",
        attr='content')
    latest_chapter_url = AttrField(
        css_select="meta[property='og:novel:latest_chapter_url']",
        attr='content')
    latest_chapter_time = AttrField(
        css_select="meta[property='og:novel:update_time']", attr='content')

    # novel_name = TextField(css_select='div.c-left>div.mod>div.hd>h2')
    # author = TextField(css_select='div.author-zone div.right a.name strong')
    # cover = AttrField(css_select='img.book-cover', attr='src')
    # abstract = TextField(css_select='pre.note')
    # status = ''
    # novels_type = TextField(css_select='div.c-left>div.mod>div.hd>p.infos>span.cate>a')
    # latest_chapter = ''
    # novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')
    async def clean_cover(self, cover):
        """
        Upgrade a plain-HTTP cover URL to HTTPS.

        Only a leading ``http://`` scheme is rewritten. The previous
        implementation replaced every ``http`` substring (and skipped the
        upgrade whenever ``https`` appeared anywhere), which could corrupt
        URLs containing "http" in their path or query.

        :param cover: extracted cover URL.
        :return: the URL with an ``https://`` scheme when it started with
            ``http://``, otherwise the URL unchanged.
        """
        insecure_prefix = 'http://'
        if cover.startswith(insecure_prefix):
            return 'https://' + cover[len(insecure_prefix):]
        return cover

    async def clean_novels_type(self, novels_type):
        """
        Map a category name through the alias table.

        :param novels_type: extracted category value.
        :return: the mapped category when the stripped string form is a key
            of the alias table, otherwise the original value.
        """
        types_dict = {'社会': '都市'}
        # Single lookup; the former debug print of the same value was removed.
        return types_dict.get(str(novels_type).strip(), novels_type)

    async def clean_latest_chapter_time(self, latest_chapter_time):
        """
        Replace relative day words with concrete local dates.

        :param latest_chapter_time: extracted update-time text.
        :return: the text with '今天' replaced by today's date and '昨日' by
            the date 24 hours earlier, each formatted ``"%Y-%m-%d "``.
        """
        day_format = "%Y-%m-%d "
        today = time.strftime(day_format, time.localtime())
        yesterday = time.strftime(
            day_format, time.localtime(time.time() - 24 * 60 * 60))
        return latest_chapter_time.replace('今天', today).replace(
            '昨日', yesterday)
class ArticleListItem(Item):
    """
    Field declarations for an entry of an article list.

    eg: http://www.ruanyifeng.com/blog/essays/
    """

    target_item = TextField(css_select="div#alpha-inner li.module-list-item")
    # Title text and link are both taken from the same anchor selector.
    title = TextField(css_select="li.module-list-item>a")
    href = AttrField(attr="href", css_select="li.module-list-item>a")
Ejemplo n.º 7
0
class FictionItem(Item):
    """Field declarations: ``dd`` as target item, plus text and href of ``a``."""

    target_item = TextField(css_select="dd")
    title = TextField(css_select="a")
    url = AttrField(attr="href", css_select="a")

    async def clean_title(self, value):
        """Return the title exactly as extracted."""
        return value
Ejemplo n.º 8
0
class QidianNovelInfoItem(Item):
    """
    Item subclass describing a novel's detail page.
    """
    novel_name = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    # When the value to extract is an attribute, declare an AttrField.
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    status = TextField(css_select='p.tag>span.blue')
    novels_type = TextField(css_select='p.tag>a.red')
    latest_chapter = TextField(css_select='li.update>div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    async def clean_cover(self, cover):
        """
        Prefix the cover URL with the ``http:`` scheme.

        :param cover: extracted ``src`` value.
        :return: ``'http:'`` followed by the value.
        """
        return 'http:' + cover

    async def clean_status(self, status):
        """
        Join the status tags into one ``#``-separated string.

        When only one object matches, the value is extracted directly;
        otherwise a list is delivered and is flattened here.

        :param status: an already-extracted string, or an iterable of
            elements exposing ``.text``.
        :return: the string unchanged, or the elements' texts joined by '#'.
        """
        # A plain string has no `.text` on its characters; iterating it used
        # to raise AttributeError.
        if isinstance(status, str):
            return status
        return '#'.join([i.text for i in status])

    async def clean_novels_type(self, novels_type):
        """
        Join the category tags into one ``#``-separated string.

        :param novels_type: an already-extracted string, or an iterable of
            elements exposing ``.text``.
        :return: the string unchanged, or the elements' texts joined by '#'.
        """
        if isinstance(novels_type, str):
            return novels_type
        return '#'.join([i.text for i in novels_type])

    async def clean_latest_chapter_time(self, latest_chapter_time):
        """
        Replace relative day words with concrete local dates.

        :param latest_chapter_time: extracted update-time text.
        :return: the text with '今天' replaced by today's date and '昨日' by
            the date 24 hours earlier, each formatted ``"%Y-%m-%d "``.
        """
        day_format = "%Y-%m-%d "
        today = time.strftime(day_format, time.localtime())
        yesterday = time.strftime(
            day_format, time.localtime(time.time() - 24 * 60 * 60))
        return latest_chapter_time.replace('今天', today).replace(
            '昨日', yesterday)
Ejemplo n.º 9
0
class JianshuItem(Item):
    """Field declarations for an author entry: display name and profile URL."""

    target_item = TextField(css_select="ul.list>li")
    author_name = TextField(css_select="a.name")
    author_url = AttrField(css_select="a.name", attr="href")

    async def clean_author_url(self, author_url):
        """Prepend the site origin to the extracted ``href``."""
        return "https://www.jianshu.com{}".format(author_url)
Ejemplo n.º 10
0
class HackerNewsItem(Item):
    """Field declarations for one ``tr.athing`` row: story title and link."""

    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(attr="href", css_select="a.storylink")

    async def clean_title(self, value):
        """Return the title exactly as extracted."""
        return value
Ejemplo n.º 11
0
class HackerNewsItem(Item):
    """Field declarations for one ``tr.athing`` row: story title and link."""

    target_item = TextField(css_select='tr.athing')
    title = TextField(css_select='a.storylink')
    url = AttrField(attr='href', css_select='a.storylink')

    async def clean_title(self, value):
        """Return the title with surrounding whitespace removed."""
        stripped = value.strip()
        return stripped
Ejemplo n.º 12
0
class ArchivesItem(Item):
    """
    Field declarations for an entry of the archives list.

    eg: http://www.ruanyifeng.com/blog/archives.html
    """

    target_item = TextField(css_select='div#beta-inner li.module-list-item')
    # Link of the archive entry.
    href = AttrField(attr='href', css_select='li.module-list-item>a')
Ejemplo n.º 13
0
class FishItem(Item):
    """Field declarations for a search-result entry: title, date and link."""

    target_item = TextField(css_select="div.m_search_list dl")
    title = TextField(css_select="h2 a")
    date = TextField(css_select="dd.search_laiyuan")
    url = AttrField(attr="href", css_select="h2 a")

    async def clean_date(self, value):
        """Remove the '发布时间:' label from the extracted date text."""
        return value.replace('发布时间:', '')
Ejemplo n.º 14
0
class Data258WechatItem(Item):
    """
    Field declarations for the first-level search page of official accounts.

    Example: https://mp.data258.com/mp/search?type=category&key=老胡的储物柜&sort=
    """

    target_item = TextField(css_select='div.layui-panel')
    # Both fields fall back to an empty string via ``default``.
    wechat_name = TextField(default='', css_select='h2>a')
    wehcat_href = AttrField(default='', attr='href', css_select='h2>a')
Ejemplo n.º 15
0
class DiseaseItem(Item):
    """Field declarations for disease names and their links (both ``many=True``)."""

    disease_name = TextField(many=True, css_select="div.keshi_list>a")
    disease_url = AttrField(
        many=True,
        attr="href",
        css_select="div.keshi_list>a",
    )

    async def clean_disease_url(self, disease_url):
        """Resolve every extracted link against the site origin with ``urljoin``."""
        origin = 'https://m.120ask.com'
        absolute_links = []
        for link in disease_url:
            absolute_links.append(urljoin(origin, link))
        return absolute_links
Ejemplo n.º 16
0
class FishItem(Item):
    """Field declarations for a table-cell entry: title, date and link."""

    target_item = TextField(css_select='td [width="530"]')
    title = TextField(css_select="a")
    date = TextField(css_select="td")
    url = AttrField(attr="href", css_select="a")

    async def clean_date(self, value):
        """Take the last ten characters and swap '.' for '-'."""
        tail = value[-10:]
        return tail.replace('.', '-')
Ejemplo n.º 17
0
class FishItem(Item):
    """Field declarations for an entry under ``#info``: title, date and link."""

    target_item = TextField(css_select='#info ')
    title = TextField(css_select="h2 a")
    date = TextField(css_select="dd.search_laiyuan")
    url = AttrField(attr="href", css_select="h2 a")

    async def clean_date(self, value):
        """Return the text after the first '(' segment split, minus trailing ')'.

        The cleaned value is also printed to stdout.
        """
        after_paren = value.split('(')[1]
        cleaned = after_paren.rstrip(')')
        print('date =', cleaned)
        return cleaned
Ejemplo n.º 18
0
class DoubanItems(Item):
    """Field declarations for one ``div.item``: title, cover and abstract."""

    target_item = TextField(css_select="div.item")
    title = TextField(css_select="span.title")
    cover = AttrField(attr="src", css_select="div.pic>a>img")
    abstract = TextField(css_select="span.inq")

    async def clean_title(self, title):
        """Return a string title as-is; otherwise concatenate the elements' texts.

        Each element's ``.text`` is stripped and has '\\xa0' removed before joining.
        """
        if isinstance(title, str):
            return title
        pieces = []
        for node in title:
            pieces.append(node.text.strip().replace('\xa0', ''))
        return ''.join(pieces)
Ejemplo n.º 19
0
class DoubanItem(Item):
    """Field declarations for one ``div.item``: title, cover and abstract."""

    target_item = TextField(css_select='div.item')
    title = TextField(css_select='span.title')
    cover = AttrField(attr='src', css_select='div.pic>a>img')
    # The abstract falls back to an empty string via ``default``.
    abstract = TextField(default='', css_select='span.inq')

    async def clean_title(self, title):
        """Return a string title as-is; otherwise concatenate the elements' texts.

        Each element's ``.text`` is stripped and has '\\xa0' removed before joining.
        """
        if isinstance(title, str):
            return title
        pieces = []
        for node in title:
            pieces.append(node.text.strip().replace("\xa0", ""))
        return "".join(pieces)
Ejemplo n.º 20
0
class DiseaseHomeItem(Item):
    """Field declarations for a disease home page: name, subject and question links."""

    disease_name = TextField(css_select="h1.ti")
    disease_subject = TextField(css_select="div.table>div:nth-child(1)>span")
    disease_ask_lists = TextField(
        many=True,
        css_select="div.ask_lists div.lists a",
    )
    disease_ask_link_lists = AttrField(
        many=True,
        attr="href",
        css_select="div.ask_lists div.lists a",
    )

    async def clean_disease_ask_link_lists(self, disease_ask_link_lists):
        """Prefix every extracted link with the ``http:`` scheme."""
        prefixed = []
        for link in disease_ask_link_lists:
            prefixed.append('http:' + link)
        return prefixed
Ejemplo n.º 21
0
class ZHNovelsItem(Item):
    """Field declarations, cleaners and persistence for one ``div.bookbox`` entry."""

    target_item = TextField(css_select='div.store_collist div.bookbox')
    novel_url = AttrField(css_select='div.bookinfo div.bookname a',
                          attr='href')
    novel_title = TextField(css_select='div.bookinfo div.bookname a')
    novel_author = TextField(css_select='div.bookilnk a:nth-child(1)')
    novel_type = TextField(css_select='div.bookilnk a:nth-child(2)')
    novel_cover = AttrField(css_select='div.bookimg img', attr='src')
    novel_abstract = TextField(css_select='div.bookintro')
    novel_lastest_update = TextField(css_select='div.bookupdate a')

    async def clean_novel_url(self, novel_url):
        """
        Turn a book URL into its chapter-list URL.

        :param novel_url: extracted ``href`` value.
        :return: the value with '/book/' replaced by '/showchapter/'.
        """
        return novel_url.replace('/book/', '/showchapter/')

    async def clean_novel_author(self, novel_author):
        """
        Normalize the author value.

        :param novel_author: extracted value, possibly a list of elements.
        :return: '' for a falsy value; the ``.text`` of the first element for
            a list; otherwise the value unchanged.
        """
        if not novel_author:
            return ''
        if isinstance(novel_author, list):
            novel_author = novel_author[0].text
        return novel_author

    # This cleaner was previously also named `clean_novel_url`, so it
    # silently replaced the URL cleaner above and the abstract was never
    # cleaned. It is now named after the field its parameter refers to.
    async def clean_novel_abstract(self, novel_abstract):
        """
        Strip escaped line-break and ideographic-space markers from the abstract.

        :param novel_abstract: extracted abstract text.
        :return: the text with the three marker sequences removed.
        """
        # NOTE(review): these patterns match the literal text backslash-r,
        # backslash-n and backslash-u3000, not real CR/LF/U+3000 characters —
        # confirm that is what the page actually contains.
        return novel_abstract.replace('\\r',
                                      '').replace('\\n',
                                                  '').replace(r'\u3000', '')

        # def tal_novel_author_home_url(self, novel_author_home_url):
        #     if isinstance(novel_author_home_url, list):
        #         novel_author_home_url = novel_author_home_url[0].get('href').strip()
        #     return 'http:' + novel_author_home_url

    async def save(self, res_dic):
        """
        Persist one result via ``self.es.Index_Data``.

        :param res_dic: the data passed through to ``Index_Data``.
        :return: True on success; False when any exception was raised (the
            exception is logged through ``self.logger``).
        """
        # Store into es (presumably Elasticsearch — confirm against the
        # object assigned to `self.es`).
        try:
            await self.es.Index_Data(res_dic)
            # self.logger.info("inserted successfully")
            return True
        except Exception as e:
            self.logger.exception(e)
            return False
Ejemplo n.º 22
0
class ZHNovelInfoItem(Item):
    """
    Item subclass describing a novel's detail page.
    """
    novel_name = TextField(css_select='div.main div.status h1 a')
    author = TextField(css_select='div.main div.status div.booksub a')
    # When the value to extract is an attribute, declare an AttrField.
    cover = AttrField(css_select='div.main div.book_cover img', attr='src')
    abstract = TextField(css_select='div.main div.status div.info_con p')
    status = AttrField(css_select='div.main div.status h1 em', attr='title')
    novels_type = TextField(css_select='div.main div.status div.booksub a')
    novel_chapter_url = AttrField(
        css_select='div.main div.status div.book_btn span.list a', attr='href')

    async def clean_author(self, author):
        """
        Normalize the author value.

        :param author: extracted value, possibly a list of elements.
        :return: the ``.text`` of the first element for a list, otherwise the
            value unchanged.
        """
        if isinstance(author, list):
            return author[0].text
        return author

    async def clean_status(self, status):
        """
        Normalize the status value.

        When only one object matches, the value is extracted directly;
        otherwise a list is delivered and is flattened here.

        :param status: extracted value, possibly a list of elements.
        :return: for a list, each element's stripped ``title`` attribute with
            '作品' removed, joined by '#'; otherwise the value unchanged.
        """
        if isinstance(status, list):
            return '#'.join(
                [i.get('title').strip().replace('作品', '') for i in status])
        return status

    async def clean_novels_type(self, novels_type):
        """
        Pick the category from the second matched element.

        :param novels_type: extracted value, possibly a list of elements.
        :return: the ``.text`` of the element at index 1 for a list; '' when
            the value is not a list, the list is too short, or the element
            has no ``.text``.
        """
        if not isinstance(novels_type, list):
            return ''
        try:
            return novels_type[1].text
        # Narrowed from a bare `except:` so unrelated errors (and
        # KeyboardInterrupt/SystemExit) are no longer swallowed.
        except (IndexError, AttributeError):
            return ''
Ejemplo n.º 23
0
class ZHNovelsItem(Item):
    """Field declarations and cleaners for one ``div.bookbox`` entry."""

    target_item = TextField(css_select="div.store_collist div.bookbox")
    novel_url = AttrField(attr="href", css_select="div.bookinfo div.bookname a")
    novel_name = TextField(css_select="div.bookinfo div.bookname a")
    novel_author = TextField(css_select="div.bookilnk a:nth-child(1)")
    novel_author_home_url = AttrField(
        attr="href",
        css_select="div.bookilnk a:nth-child(1)",
    )
    novel_type = TextField(css_select="div.bookilnk a:nth-child(2)")
    novel_cover = AttrField(attr="src", css_select="div.bookimg img")
    novel_abstract = TextField(css_select="div.bookintro")
    novel_latest_chapter = TextField(css_select="div.bookupdate a")

    # def tal_novel_url(self, novel_url):
    # return 'http:' + novel_url
    async def clean_novel_author(self, novel_author):
        """Return '' for a falsy author; for a list, the first element's ``.text``."""
        if not novel_author:
            return ''
        if isinstance(novel_author, list):
            return novel_author[0].text
        return novel_author
Ejemplo n.º 24
0
class QidianNovelsItem(Item):
    """Field declarations and cleaners for one novel entry in a list page."""

    target_item = TextField(css_select="div.book-img-text>ul>li")
    novel_url = AttrField(attr="href", css_select="div.book-img-box>a")
    novel_title = TextField(css_select="div.book-mid-info>h4")
    novel_author = TextField(css_select="div.book-mid-info>p.author>a.name")
    novel_type = TextField(
        css_select="div.book-mid-info > p.author > a:nth-child(4)",
    )
    novel_status = TextField(css_select="div.book-mid-info>p.author>span")
    novel_cover = AttrField(attr="src", css_select="div.book-img-box img")
    novel_abstract = TextField(css_select="div.book-mid-info p.intro")
    novel_lastest_update = TextField(css_select="div.book-mid-info > p.update")

    async def clean_novel_url(self, novel_url):
        """Prefix the extracted URL with the ``https:`` scheme."""
        scheme = "https:"
        return scheme + novel_url

    async def clean_novel_author(self, novel_author):
        """Return the author; for a list, use the ``.text`` of its first element."""
        if not isinstance(novel_author, list):
            return novel_author
        return novel_author[0].text

    async def clean_novel_cover(self, novel_cover):
        """Prefix the cover URL with the ``https:`` scheme."""
        scheme = "https:"
        return scheme + novel_cover
Ejemplo n.º 25
0
class RankingItem(Item):
    """Field declarations and cleaners for one ranking block (``.rank-list``)."""

    target_item = TextField(css_select=".rank-list")
    ranking_title = TextField(css_select="h3.wrap-title")
    more = AttrField(attr="href", css_select="h3>a.more")
    book_list = HtmlField(many=True, css_select="div.book-list>ul>li")

    async def clean_ranking_title(self, ranking_title):
        """Return the title cut after its first '榜'.

        For a list, the ``.text`` of the first element is returned instead.
        Otherwise everything before the first '榜' is kept and '榜' is appended.
        """
        if isinstance(ranking_title, list):
            return ranking_title[0].text
        head, _, _ = str(ranking_title).partition('榜')
        return head + '榜'

    async def clean_more(self, more):
        """Prefix the "more" link with the ``https:`` scheme."""
        scheme = "https:"
        return scheme + more
Ejemplo n.º 26
0
class Data258WechatListItem(Item):
    """
    Field declarations for an official account's historical article list.

    Example: https://mp.data258.com/article/category/howie_locker
    """

    target_item = TextField(css_select='ul.jie-row>li')
    # Both fields fall back to an empty string via ``default``.
    w_article_title = TextField(default='', css_select='a.jie-title')
    w_article_href = AttrField(
        default='',
        attr='href',
        css_select='a.jie-title',
    )

    async def clean_w_article_title(self, value: list):
        """Return the stripped string form of the title, or "" when it is falsy."""
        if not value:
            return ""
        return str(value).strip()
Ejemplo n.º 27
0
class HackerNewsItem(Item):
    """
    Extraction rules for the target fields.
    """
    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(attr="href", css_select="a.storylink")

    async def clean_title(self, value):
        """
        Clean the extracted title.

        :param value: raw extracted value
        :return: its string form with surrounding whitespace removed
        """
        as_text = str(value)
        return as_text.strip()
Ejemplo n.º 28
0
class FishItem(Item):
    """Field declarations and cleaners for one ``div.jsearch-result-box`` entry."""

    target_item = TextField(css_select="div.jsearch-result-box")
    title = TextField(css_select="div.jsearch-result-title")
    date = TextField(css_select="span.jsearch-result-date")
    url = AttrField(attr="href", css_select="div.jsearch-result-title a")

    async def clean_date(self, value):
        """Convert a '年/月/日' date text into a dash-separated one.

        Trailing '-', then spaces, then '日' are stripped (in that order)
        before '年' and '月' are replaced by '-'.
        """
        trimmed = value.rstrip('-').rstrip(' ').rstrip('日')
        for marker in ('年', '月'):
            trimmed = trimmed.replace(marker, '-')
        return trimmed

    async def clean_url(self, value):
        """Prepend the site origin to the extracted ``href``."""
        return 'http://www.cast.org.cn' + value
Ejemplo n.º 29
0
class SGWechatItem(Item):
    """
    Field declarations for the Sogou WeChat official-account search page;
    usually there is only one result.

    Example: https://weixin.sogou.com/weixin?query=老胡的储物柜
    """

    # By default this page is treated as a multi-row content list.
    target_item = TextField(css_select='div.news-box>ul>li')
    wechat_name = TextField(default='', css_select='p.tit>a')
    wechat_id = TextField(default='', css_select='label[name="em_weixinhao"]')
    latest_title = TextField(
        default="暂无更新",
        css_select='dd>a[target="_blank"]',
    )
    latest_href = AttrField(
        default='',
        attr='href',
        css_select='dd>a[target="_blank"]',
    )

    async def clean_wechat_name(self, wechat_name: str) -> str:
        """
        Clean wechat_name: drop newlines and spaces, then strip.
        """
        text = str(wechat_name)
        for unwanted in ("\n", " "):
            text = text.replace(unwanted, "")
        return text.strip()

    async def clean_wechat_id(self, wechat_id: str) -> str:
        """
        Clean wechat_id: strip surrounding whitespace.
        """
        text = str(wechat_id)
        return text.strip()

    async def clean_latest_title(self, latest_title: str) -> str:
        """
        Clean latest_title: drop newlines and spaces, then strip.
        """
        text = str(latest_title)
        for unwanted in ("\n", " "):
            text = text.replace(unwanted, "")
        return text.strip()

    async def clean_latest_href(self, latest_href: str) -> str:
        """
        Clean latest_href: prepend the Sogou origin, or return "" when empty.
        """
        if not latest_href:
            return ""
        return f"https://weixin.sogou.com/{latest_href}"
Ejemplo n.º 30
0
    def refresh_user_ns(self, request: Request, response: Response):
        """
        Re-populate ``self.user_ns`` with helpers bound to the given exchange.

        Entries written: ``asyncio``, ``ruia``, ``request``, ``response``,
        ``fetch`` and the two extraction shortcuts ``attr_field`` /
        ``text_field``.

        :param request: ruia.Request
        :param response: ruia.Response
        :return:
        """
        import ruia

        self.user_ns["asyncio"] = asyncio
        self.user_ns["ruia"] = ruia
        self.user_ns["request"] = request
        self.user_ns["response"] = response
        self.user_ns["fetch"] = self.fetch

        # Each shortcut builds a field from its keyword arguments and extracts
        # from ``etree``, which defaults to the etree of this response
        # (captured when the lambda is created).
        attr_shortcut = lambda etree=response.etree, **kwargs: AttrField(
            **kwargs).extract(etree)
        self.user_ns["attr_field"] = attr_shortcut

        text_shortcut = lambda etree=response.etree, **kwargs: TextField(
            **kwargs).extract(etree)
        self.user_ns["text_field"] = text_shortcut