Example #1
0
    def parse_item(self, response):
        """Parse one article page and yield a single ``JianshuItem``.

        Title, publish time, author link and body are read from the
        ``div.article`` markup; the author name and the numeric counters are
        extracted with regular expressions from the ``page-data`` <script>.

        Missing nodes do not abort the callback: text fields fall back to
        ``None`` (or ``''`` for the author) and every counter falls back
        to ``0``.
        """
        article = response.xpath('//div[@class="article"]')

        # Title
        title = article.xpath('.//h1/text()').get()

        # Publish time. .get() returns None when the span is absent, so the
        # surrounding '*' markers are stripped only when there is text.
        publish_time = article.xpath(
            './/span[@class="publish-time"]/text()').get()
        if publish_time is not None:
            publish_time = publish_time.strip('*')

        # URL of the current page
        page_url = response.url

        # Author home page. Joining a missing href would fall back to the
        # page's own URL, so the field is left empty when there is no link.
        avatar_href = article.xpath('.//a[@class="avatar"]/@href').get()
        user_home = response.urljoin(avatar_href) if avatar_href else None

        # Article body as HTML. The expression starts with '//', so it
        # searches the whole document rather than only inside `article`.
        content = article.xpath('//div[@class="show-content-free"]').get()

        data_element = response.xpath('//script[@data-name="page-data"]')

        def first_int(pattern):
            """Return the first capture of `pattern` as an int, or 0."""
            captured = data_element.re(pattern)
            try:
                return int(captured[0]) if captured else 0
            except ValueError:
                # Capture was not a number (for example ``null``).
                return 0

        # Author nickname
        nicknames = data_element.re(r'"nickname":"([^"]+)",')
        author = nicknames[0] if nicknames else ''

        # Counters: word count, comments, likes, views and the page id.
        # All of them share the same fallback (0) so the item has a
        # consistent type for every numeric field.
        words_count = first_int(r'"public_wordage":([^"]+?),')
        comments_count = first_int(r'"comments_count":([^"]+?),')
        likes_count = first_int(r'"likes_count":([^"]+?),')
        views_count = first_int(r'"views_count":([^"]+?),')
        special_id = first_int(r'"id":([^"]+?),')

        item = JianshuItem(
            title=title,
            publish_time=publish_time,
            page_url=page_url,
            user_home=user_home,
            content=content,
            author=author,
            words_count=words_count,
            comments_count=comments_count,
            likes_count=likes_count,
            views_count=views_count,
            special_id=special_id,
        )
        yield item
    def parse_detail(self, response):
        """Parse an article detail page and yield one ``JianshuItem``.

        Fields that cannot be found on the page are stored as ``None``
        instead of raising inside the callback.
        """
        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()

        # .get() returns None when the node is missing; only remove the '*'
        # markers when a publish time was actually found.
        pub_time = response.xpath(
            "//span[@class='publish-time']/text()").get()
        if pub_time is not None:
            pub_time = pub_time.replace("*", "")

        # Article id: last path segment of the URL, query string removed.
        article_id = response.url.split("?")[0].split("/")[-1]

        # Article body kept as HTML (tags included), not as plain text.
        content = response.xpath("//div[@class='show-content']").get()

        item = JianshuItem(
            title=title,
            avatar=avatar,
            pub_time=pub_time,
            author=author,
            origin_url=response.url,
            content=content,
            article_id=article_id,
        )
        yield item
Example #3
0
    def parse(self, response):
        """Parse a recommended-collections listing page.

        Yields one ``JianshuItem`` per ``div.collection-wrap`` block and then
        a ``Request`` for each of pages 1-20 of the hot-collections listing,
        handled by this same callback. Nothing is yielded when the page
        contains no collection blocks.
        """
        blocks = response.xpath('.//div[@class="collection-wrap"]')
        if not blocks:
            return

        for block in blocks:
            # A fresh item per collection: re-yielding one shared instance
            # would let later rows overwrite the fields of earlier ones.
            item = JianshuItem()
            # extract_first(default='') replaces extract()[0] so one
            # incomplete block cannot abort the whole page.
            item['name'] = block.xpath(
                'a[1]/h4/text()').extract_first(default='')
            item['content'] = block.xpath(
                'a[1]/p/text()').extract_first(default='')
            item['article_num'] = block.xpath(
                'div/a/text()').extract_first(default='')
            item['fans'] = block.xpath(
                'div/text()').extract_first(default='')
            yield item

        base_url = 'https://www.jianshu.com/recommendations/collections?page={}&order_by=hot'
        for page in range(1, 21):
            yield Request(base_url.format(page), callback=self.parse)
Example #4
0
 def parse_item(self, response):
     """Parse an article page and yield one ``JianshuItem``.

     All values are taken from the page markup with XPath; a node that is
     not present yields ``None`` for its field. The body is stored as plain
     text with ideographic and non-breaking spaces removed.
     """
     title = response.xpath(
         "//div[@class='article']/h1[@class='title']/text()").get()
     author = response.xpath(
         "//div[@class='info']/span[@class='name']/a/text()").get()
     publish_time = response.xpath(
         "//div[@class='meta']/span[@class='publish-time']/text()").get()
     word = response.xpath(
         "//div[@class='meta']/span[@class='wordage']/text()").get()

     # Body: join every text node below the paragraphs, then drop
     # U+3000 / U+00A0 spacing characters and surrounding whitespace.
     paragraphs = response.xpath(
         "//div[@class='show-content-free']/p//text()").getall()
     content = "".join(paragraphs).replace("\u3000", "").replace(
         "\xa0", "").strip()

     url = response.url
     # Last path segment of the URL without its query string. Kept in a
     # local that does not shadow the builtin ``id``.
     article_id = url.split("?")[0].split("/")[-1]

     # The sibling counters are all plural ("comments-count",
     # "likes-count", "rewards-count"); the singular "view-count" used
     # before does not follow that pattern.
     # NOTE(review): confirm "views-count" against a live page.
     view_count = response.xpath(
         "//div[@class='meta']/span[@class='views-count']/text()").get()
     comment_count = response.xpath(
         "//div[@class='meta']/span[@class='comments-count']/text()").get()
     like_count = response.xpath(
         "//div[@class='meta']/span[@class='likes-count']/text()").get()
     reward_count = response.xpath(
         "//div[@class='meta']/span[@class='rewards-count']/text()").get()

     item = JianshuItem(title=title,
                        author=author,
                        publish_time=publish_time,
                        word=word,
                        content=content,
                        url=url,
                        id=article_id,
                        view_count=view_count,
                        comment_count=comment_count,
                        like_count=like_count,
                        reward_count=reward_count)
     yield item
Example #5
0
    def parse(self, response):
        """Parse a collection's note list.

        Yields one ``JianshuItem`` per ``ul.note-list/li`` entry, followed by
        a ``Request`` for page 2 of the same collection (``range(2, 3)``
        covers only that page), handled by this same callback.
        """
        selector = Selector(response)

        def pick(node, path, index=0, default=''):
            """Return the index-th string matched by `path`, or `default`."""
            values = node.xpath(path).extract()
            return values[index] if len(values) > index else default

        for info in selector.xpath('//ul[@class="note-list"]/li'):
            # A fresh item per note: re-yielding one shared instance would
            # let later rows overwrite the fields of earlier ones.
            item = JianshuItem()
            item['user'] = pick(info, 'div/div[1]/div/a/text()')
            item['time'] = pick(info, 'div/div[1]/div/span/@data-shared-at')
            item['title'] = pick(info, 'div/a/text()')
            # View and comment counts are read from the second text node
            # of their link, hence index 1.
            item['view'] = pick(info, 'div/div[2]/a[1]/text()', 1).strip()
            item['comment'] = pick(info, 'div/div[2]/a[2]/text()', 1).strip()
            item['like'] = pick(info, 'div/div[2]/span[1]/text()').strip()
            # The second span is optional; treat its absence as '0'.
            item['gain'] = pick(
                info, 'div/div[2]/span[2]/text()', default='0').strip()
            yield item

        page_url = 'https://www.jianshu.com/c/bd08b5306eb6?order_by=added_at&page={}'
        for page in range(2, 3):
            yield Request(page_url.format(page), callback=self.parse)
Example #6
0
 def parse_item(self, response):
     """Parse an article page and yield one ``JianshuItem``.

     Text fields are whitespace-stripped; a node that is missing from the
     page produces an empty string instead of raising on ``None.strip()``.
     """
     def text(xpath):
         """Return the stripped first match of `xpath`, or ''."""
         return response.xpath(xpath).get(default='').strip()

     title = text("//div[@class='article']/h1[@class='title']/text()")
     avatar = text(
         "//div[@class='article']/div[@class='author']/a/img/@src")
     author = text(
         "//div[@class='article']//div[@class='info']/span/a/text()")
     pub_time = text(
         "//div[@class='meta']/span[@class='publish-time']/text()")
     origin_url = response.url

     # Example URL path:
     #   /p/7ba4ea51d56c?utm_campaign=maleskine&utm_content=note&...
     # Drop everything after '?', then keep the last path segment
     # (7ba4ea51d56c). The item field is called ``author_id`` although the
     # value is taken from the page URL.
     author_id = response.url.split('?')[0].split('/')[-1]

     # Article body kept as HTML.
     content = response.xpath("//div[@class='show-content-free']").get()

     item = JianshuItem(title=title,
                        avatar=avatar,
                        author=author,
                        pub_time=pub_time,
                        origin_url=origin_url,
                        author_id=author_id,
                        content=content)
     yield item
Example #7
0
    def parse(self, response):
        """Parse a recommended-collections listing page.

        Yields one ``JianshuItem`` per ``div.collection-wrap`` block, then a
        ``Request`` for each of pages 2-20 of the hot-collections listing,
        handled by this same callback.
        """
        selector = Selector(response)

        for info in selector.xpath('//div[@class="collection-wrap"]'):
            # A fresh item per collection: re-yielding one shared instance
            # would let later rows overwrite the fields of earlier ones.
            item = JianshuItem()
            # extract_first(default=...) replaces extract()[0] so one
            # incomplete block cannot abort the whole page.
            item['name'] = info.xpath(
                'a[1]/h4/text()').extract_first(default='')
            # Collections without a description keep the single-space
            # placeholder.
            item['content'] = info.xpath(
                'a[1]/p/text()').extract_first(default=' ')
            item['article'] = info.xpath(
                'div/a/text()').extract_first(default='')
            item['fans'] = info.xpath(
                'div/text()').extract_first(default='')
            yield item

        page_url = 'https://www.jianshu.com/recommendations/collections?page={}&order_by=hot'
        for page in range(2, 21):
            yield Request(page_url.format(page), callback=self.parse)
Example #8
0
File: js.py Project: lei025/pyyyy
    def parse_item(self, response):
        """Parse an article detail page and return one ``JianshuItem``.

        ``content`` is a list: the string value of ``div.show-content`` with
        newlines and non-breaking spaces removed, or an empty list when that
        value is blank.
        """
        title = response.xpath("//h1[@class='title']/text()").get()
        # Author avatar
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()
        # Publish time
        pub_time = response.xpath("//span[@class='publish-time']/text()").get()
        # Detail-page id: last path segment of the URL, query string removed
        article_id = response.url.split("?")[0].split("/")[-1]
        # Article body as text
        raw_content = response.xpath(
            "string(//div[@class='show-content'])").extract()
        content = [
            chunk.strip().replace('\n', '').replace('\xa0', '')
            for chunk in raw_content
            if chunk.strip()
        ]

        return JianshuItem(title=title,
                           avatar=avatar,
                           author=author,
                           pub_time=pub_time,
                           origin_url=response.url,
                           article_id=article_id,
                           content=content)
Example #9
0
 def parse_detail(self, response):
     """Build a ``JianshuItem`` from an article page and yield it.

     Each field is the first match of its XPath (``None`` when absent),
     except ``subject``, which is every matching topic name joined with
     commas, and ``article_id``, which is derived from the page URL.
     """
     page_url = response.url
     # e.g. https://www.jianshu.com/p/9713ff94c4a5 -> 9713ff94c4a5
     # Cut at the question mark first, then keep the last path segment.
     without_query = page_url.partition("?")[0]
     slug = without_query.rsplit('/', 1)[-1]

     # The topics come back as a list; join them with commas because the
     # database column cannot hold a list.
     topic_names = response.xpath(
         '//div[@class="_2Nttfz"]/a/span/text()').getall()

     fields = {
         "title": response.xpath('//h1[@class="_1RuRku"]/text()').get(),
         "content": response.xpath('//article[@class="_2rhmJa"]').get(),
         "article_id": slug,
         "origin_url": page_url,
         "author": response.xpath('//span[@class="FxYr8x"]/a/text()').get(),
         "avatar": response.xpath(
             '//div[@class="_2mYfmT"]/a/img/@src').get(),
         "pub_time": response.xpath(
             '//div[@class="s-dsoj"]/time/text()').get(),
         "subject": ",".join(topic_names),
     }
     yield JianshuItem(**fields)
Example #10
0
 def parse_text(self, response):
     """Parse an article page and yield one ``JianshuItem``.

     Fields: title, author link (``user_url``), date, the text of the last
     span in the meta row (``dz``) and the paragraph texts joined with two
     spaces (``text``).
     """
     item = JianshuItem()
     item['title'] = response.css("._gp-ck ._1RuRku::text").get()

     # The href used to be concatenated onto the literal
     # "https://www.jianshu_selenium.com", which is not a valid host name
     # and raised TypeError when the link was missing. Resolve it against
     # the page URL instead and leave the field empty when absent.
     href = response.css("._gp-ck ._1OhGeD::attr(href)").get()
     item['user_url'] = response.urljoin(href) if href else None

     item['date'] = response.css("._gp-ck .s-dsoj time::text").get()
     item['dz'] = response.css(
         "._gp-ck .s-dsoj span:last-child::text").get()
     item['text'] = "  ".join(
         response.css("._gp-ck ._2rhmJa p::text").getall())
     yield item
Example #11
0
    def parse(self, response):
        """Parse an article page and yield one ``JianshuItem``.

        Counts are stored as strings taken from the page. A field whose node
        is missing becomes ``None`` (``'0'`` for the like count) instead of
        raising inside the callback.
        """
        def last_token(text):
            """Return the last whitespace-separated token, or None."""
            tokens = text.split() if text else []
            return tokens[-1] if tokens else None

        # Article id: last path segment of the URL, with any query string
        # removed first so tracking parameters do not end up in the id.
        article_id = response.url.split('?')[0].split('/')[-1]
        # Title
        title = response.xpath('//h1[@class="_1RuRku"]/text()').get()
        # Body, stored as HTML together with its tags
        content = response.xpath('//article').get()
        # Author
        author = response.xpath('//a[@class="_1OhGeD"]/text()').get()
        # Avatar
        avatar = response.xpath('//img[@class="_13D2Eh"]/@src').get()
        # Publish time
        pub_time = response.xpath('//div[@class="s-dsoj"]/time/text()').get()

        # Word count and read count have no attribute to select on, and an
        # extra <span> precedes them on some pages only, so they are
        # addressed from the end of the span list.
        word_count = last_token(response.xpath(
            '//div[@class="s-dsoj"]/span[last()-1]/text()').get())
        read_count = last_token(response.xpath(
            '//div[@class="s-dsoj"]/span[last()]/text()').get())

        # Comment count: the span contains a comment node (<!---->), so all
        # text nodes are collected and the last one is the number.
        comment_texts = response.xpath(
            '//div[@class="-pXE92"]/div[1]/span//text()').getall()
        comment_count = comment_texts[-1] if comment_texts else None

        # Like count: with no likes the span carries no number, so fewer
        # than two text nodes means zero.
        like_texts = response.xpath(
            '//div[@class="-pXE92"]/div[2]/span//text()').getall()
        like_count = like_texts[-1] if len(like_texts) > 1 else '0'

        # Topics the article belongs to, joined into one comma-separated
        # string.
        subjects = ','.join(response.xpath(
            '//div[contains(@class, "_2Nttfz")]/a/span/text()').getall())

        # Page URL
        origin_url = response.url

        item = JianshuItem(article_id=article_id,
                           title=title,
                           content=content,
                           author=author,
                           avatar=avatar,
                           pub_time=pub_time,
                           word_count=word_count,
                           read_count=read_count,
                           comment_count=comment_count,
                           like_count=like_count,
                           subjects=subjects,
                           origin_url=origin_url)

        yield item
Example #12
0
 def parse_item(self, response):
     """Parse an article page with lxml and yield one ``JianshuItem``.

     ``title`` is the part of the <title> text before the first '-',
     ``name`` the author link text, ``url`` the page URL without its query
     string. ``collection`` is set only when the page lists collections.
     A missing <title> or author node yields ``None`` for that field
     instead of raising IndexError.
     """
     html = etree.HTML(response.text)

     def first(path):
         """Return the first XPath result, or None when nothing matches."""
         found = html.xpath(path)
         return found[0] if found else None

     item = JianshuItem()

     # NOTE(review): splitting on the first '-' also truncates titles that
     # contain a hyphen themselves -- confirm this is intended.
     page_title = first("//title/text()")
     item['title'] = (page_title.split("-")[0]
                      if page_title is not None else None)
     item['name'] = first("//span[@class='name']/a/text()")
     item['url'] = response.url.split("?")[0]

     collection = html.xpath(
         "//div[@class='include-collection']/a/div[@class='name']/text()")
     if collection:
         item['collection'] = ','.join(collection)
     yield item
Example #13
0
    def parse(self, response):
        """Parse a home-page article list and request the next page.

        Yields one ``JianshuItem`` per article. While ``self.page`` is below
        15 it then increments the counter and yields a request for the next
        page, passing every note id collected so far as
        ``seen_snote_ids[]``.

        Reads and updates ``self.page`` and ``self.note_id_list``; reads
        ``self.headers``.
        """
        # urlencode lives in urllib.parse on Python 3 and in urllib on
        # Python 2; import whichever exists so pagination works on both.
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode

        def first(values, default=''):
            """Return the first element of `values`, or `default`."""
            return values[0] if values else default

        def to_int(values, index, default=0):
            """Return ``int(values[index])``; `default` if absent or blank."""
            try:
                return int(values[index])
            except (IndexError, ValueError):
                # The count regexes use '*', so a capture may be ''.
                return default

        # The first page and the follow-up pages are selected differently.
        if self.page == 1:
            articles = response.selector.xpath('//ul[@class="note-list"]/li')
        else:
            articles = response.selector.xpath('//li[@class="have-img"]')

        for article in articles:
            note_id = article.xpath('@data-note-id').extract()
            if note_id:
                self.note_id_list.append(note_id[0])

            title = article.xpath('div/a[@class="title"]/text()').extract()
            article_abstract = article.xpath(
                'div/p[@class="abstract"]/text()').extract()
            article_link = article.xpath(
                'div/a[@class="title"]/@href').extract()
            author = article.xpath(
                'div/div/div/a[@class="nickname"]/text()').extract()
            author_link = article.xpath(
                'div/div/div/a[@class="nickname"]/@href').extract()
            post_time = article.xpath(
                'div/div/div/span/@data-shared-at').extract()
            category = article.xpath(
                'div/div/a[@class="collection-tag"]/text()').extract()
            meta_a = article.xpath('div/div/a/text()').re(r' ([0-9]*)\n')
            meta_span = article.xpath('div/div/span/text()').re(r' ([0-9]*)')

            item = JianshuItem()
            item['title'] = first(title)
            item['article_abstract'] = first(article_abstract)
            item['article_link'] = first(article_link)
            item['author'] = first(author)
            item['author_link'] = first(author_link)
            item['post_time'] = first(post_time)
            item['category'] = first(category)
            item['views'] = to_int(meta_a, 0)
            item['comments'] = to_int(meta_a, 1)
            item['like'] = to_int(meta_span, 0)
            # The reward figure is optional; 0 when it is not shown.
            item['reward'] = to_int(meta_span, 1)
            yield item

        # Load at most 15 pages.
        if self.page < 15:
            self.page = self.page + 1
            # doseq=True expands the id list into repeated parameters.
            params = urlencode(
                {
                    "page": self.page,
                    "seen_snote_ids[]": self.note_id_list
                }, True)
            yield scrapy.Request("https://www.jianshu.com/?%s" % params,
                                 headers=self.headers,
                                 callback=self.parse)
Example #14
0
    def parse_item(self, response):
        """Return an empty ``JianshuItem``.

        Placeholder callback: no data is extracted from `response` yet.
        """
        # Fetch the detail-page data and parse it here.
        return JianshuItem()
Example #15
0
    def parse(self, response):
        """Parse a note list, download thumbnails and follow the next page.

        Yields one ``JianshuItem`` per ``ul.note-list/li`` entry. When the
        page contained at least one entry, ``self.page`` is incremented and
        a request for ``self.url + "?page=<n>"`` is yielded.

        ``title``, ``author`` and ``likeNum`` are stored as lists of
        extracted strings.
        """
        selector = Selector(response)
        articles = selector.xpath('//ul[@class="note-list"]/li')

        for article in articles:
            # A fresh item per article: re-yielding one shared instance
            # would let later rows overwrite the fields of earlier ones.
            item = JianshuItem()
            title = article.xpath('div/a/text()').extract()
            url = article.xpath('div/a/@href').extract()
            author = article.xpath('div/div[1]/div/a/text()').extract()

            # Best-effort thumbnail download; some articles have no image.
            # NOTE(review): urlretrieve is a synchronous download made from
            # inside the callback -- consider an images pipeline instead.
            try:
                image = article.xpath("div/div[1]/a/img/@src").extract()[0]
                filename = 'images/%s-%s.jpg' % (author[0], title[0])
                print("文件名:" + filename)
                print("图片地址" + image)
                urllib.request.urlretrieve(image, filename)
            except Exception:
                # Missing image, missing title/author or a failed download.
                print('--no---image--')

            # Read count: second text node of the first footer link.
            listtop = article.xpath('div/div[2]/a[1]/text()').extract()
            likeNum = article.xpath('div/div[2]/span[1]/text()').extract()
            # Comment count: second text node of the second footer link.
            readAndComment = article.xpath('div/div[2]/a[2]/text()')

            item['title'] = title
            item['url'] = 'http://www.jianshu.com/' + url[0]
            item['author'] = author

            item['readNum'] = listtop[1] if len(listtop) > 1 else ''

            # Some articles have comments disabled. An earlier unguarded
            # ``readAndComment[1]`` lookup raised before this fallback could
            # run; it has been removed.
            try:
                item['commentNum'] = readAndComment[1].extract()
            except IndexError:
                item['commentNum'] = ''
            item['likeNum'] = likeNum
            yield item

        if len(articles) > 0:
            self.page = self.page + 1
            next_link = self.url + "?page=" + str(self.page)
            print("----" + next_link)
            yield Request(next_link, callback=self.parse)
Example #16
0
    def parse_item(self, response):
        """Extract title, author and body text, print them, yield an item.

        ``content`` is every text node below <article>, joined with single
        spaces. Title and author are ``None`` when their XPath matches
        nothing.
        """
        # Article title
        heading = response.xpath('//h1[@title]/text()').get()

        # Article author
        author_xpath = '//div[@class]/a[@href]/span[@class]/text()'
        writer = response.xpath(author_xpath).get()

        # Article body text
        body_text = ' '.join(response.xpath('//article//text()').getall())

        print(heading, writer, body_text)
        yield JianshuItem(title=heading, author=writer, content=body_text)
Example #17
0
    def parse_item(self, response):
        """ This function parses a sample response. Some contracts are mingled
        with this docstring.

        @url http://www.jianshu.com/p/b851e04de659
        @returns items 1 16
        @scrapes  author content title url datetime wordnum views_count
        comments_count likes_count followers_count total_likes_count rank
        """
        # The docstring above carries contract directives (@url, @returns,
        # @scrapes) and is therefore left exactly as written.
        #
        # Overview: fills a JianshuItem from the page markup (title, author,
        # datetime, content, url) and from two <script> blocks whose text is
        # cut apart on ',' and ':' to recover the counters. Returns the item
        # (this is a plain function, not a generator).

        item = JianshuItem()
        # Logging is (re)started on every call of this callback.
        # NOTE(review): `log` is presumably the legacy scrapy.log module --
        # confirm against the file's imports.
        log.start(logfile='log.txt', loglevel=log.INFO)
        log.msg('RequestURL:%s' % response.url, spider=JSSpider)
        # First element whose class contains "preview"; IndexError if none.
        contents = response.xpath('//div[contains(@class, "preview")]')[0]
        item['title'] = contents.xpath(
            'h1[contains(@class,"title")]/text()').extract()[0]
        item['author'] = contents.xpath(
            'div/a[contains(@class,"author-name")]/span/text()').extract()[0]
        # Second text node ([1]) among the spans of the author-info div.
        item['datetime'] = contents.xpath(
            'div[contains(@class,"author-info")]/span/text()').extract()[1]
        # Content is stored as a list of serialized <p> elements.
        pagecons = response.xpath('//div[contains(@class, "show-content")]/p')
        item['content'] = pagecons.extract()
        item['url'] = response.url
        # Text of the <script> elements whose data-name contains "note".
        scriptlists = response.xpath(
            '//script[contains(@data-name,"note")]/text()').extract()
        # Keep the last six comma-separated pieces of the first script.
        scriptlist6 = scriptlists[0].strip().split(',')[-6:]
        newscripts = []
        for script in scriptlist6:
            # Each piece is split on ':' and the parts are appended to one
            # flat list.
            # NOTE(review): .encode('utf8') yields bytes on Python 3, where
            # .split(':') with a str separator raises TypeError -- this
            # looks like Python 2 code; confirm the target version.
            newscripts += script.encode('utf8').split(':')
        # Remove the double quotes, then pair consecutive entries as
        # key/value. A value that itself contains ',' or ':' would shift
        # the pairing.
        newscript = [n.replace('"', '') for n in newscripts]
        newdict = dict(newscript[i:i + 2] for i in range(0, len(newscript), 2))
        # .get() leaves a field as None when its key was not recovered.
        item['wordnum'] = newdict.get('wordage')
        item['views_count'] = newdict.get('views_count')
        item['likes_count'] = newdict.get('likes_count')
        item['comments_count'] = newdict.get('comments_count')
        # Same technique on the script whose data-name contains "author".
        followersandtotallikes = response.xpath(
            '//script[contains(@data-name,"author")]/text()').extract()
        # [-3:-1] keeps the third-to-last and second-to-last pieces.
        followersandtotallikes2 = followersandtotallikes[0].strip().split(
            ',')[-3:-1]
        newfollowersandtotallikes2 = []
        for followersandlikes in followersandtotallikes2:
            newfollowersandtotallikes2 += followersandlikes.encode(
                'utf8').split(':')
        followerslikes = [
            n.replace('"', '') for n in newfollowersandtotallikes2
        ]
        followerslikesdict = dict(followerslikes[i:i + 2]
                                  for i in range(0, len(followerslikes), 2))
        item['followers_count'] = followerslikesdict.get('followers_count')
        item['total_likes_count'] = followerslikesdict.get('total_likes_count')
        return item
Example #18
0
    def parse_html(self, response):
        """Populate a ``JianshuItem`` through the loader and yield it.

        ``ArticleItem`` is used here as the loader object: each field is
        registered with ``add_xpath``, the page URL with ``add_value``, and
        the result of ``load_item()`` is yielded.
        """
        # (field name, XPath) pairs, registered in this order.
        field_xpaths = (
            ('title', "//div[@class='_gp-ck']//h1/text()"),
            ('num', "//div[@class='s-dsoj']/span[2]/text()"),
            ('look', "//div[@class='s-dsoj']/span[3]/text()"),
            ('author', "//span[@class='_22gUMi']/text()"),
            ('favor', "//span[@class='_1LOh_5']/text()"),
            ('time', "//time/text()"),
            ('content', "//article[@class='_2rhmJa']//text()"),
        )

        loader = ArticleItem(item=JianshuItem(), response=response)
        for field, xpath in field_xpaths:
            loader.add_xpath(field, xpath)
        loader.add_value('url', response.url)

        yield loader.load_item()
Example #19
0
 def parse_item(self, response):
     """Parse an author profile page and yield one ``JianshuItem``.

     ``author_url`` and ``author_name`` are taken from ``response.meta``
     (KeyError if the request did not set them). The item is yielded only
     when the fans, articles and word-count figures are all present on the
     page; otherwise nothing is yielded.
     """
     item = JianshuItem()
     item['author_url'] = response.meta['author_url']
     item['author_name'] = response.meta['author_name']

     selector = Selector(response)
     fans = selector.xpath(
         '//div[@class="info"]/ul/li[2]/div/a/p/text()').extract_first()
     articles = selector.xpath(
         '//div[@class="info"]/ul/li[3]/div/a/p/text()').extract_first()
     word_count = selector.xpath(
         '//div[@class="info"]/ul/li[4]/div/p/text()').extract_first()

     # Incomplete profiles are still skipped, but through an explicit
     # check rather than a bare ``except: pass`` wrapped around the yield,
     # which also hid unrelated errors.
     if fans is None or articles is None or word_count is None:
         return

     item['fans'] = fans
     item['articles'] = articles
     item['word_count'] = word_count
     yield item
Example #20
0
    def parse(self, response):
        """Fill a ``JianshuItem`` from fixed absolute XPaths and yield it.

        `response` is the data returned for the requested page. Every field
        is the first match of its XPath; a path that matches nothing raises
        IndexError.
        """
        item = JianshuItem()
        selector = Selector(response)

        def first(path):
            # First extracted string for `path`.
            return selector.xpath(path).extract()[0]

        item['title'] = first(
            '/html/body/div[4]/div/div[1]/div[2]/div[1]/h1/text()')

        # Keep only the first YYYY-MM-DD date found in the span text.
        date_text = first(
            '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[1]/span[1]/text()'
        )
        item['time'] = re.findall(r'\d{4}-\d{2}-\d{2}', date_text)[0]

        item['rent'] = first(
            '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/span[2]/text()'
        )

        # The following fields drop the first four characters of the text
        # (presumably a label prefix -- TODO confirm against the page).
        item['rent_style'] = first(
            '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[2]/text()'
        )[4:]
        item['source'] = first(
            '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[3]/text()'
        )[4:]
        item['type'] = first(
            '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[4]/text()'
        )[4:]
        item['devices'] = first(
            '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[5]/text()'
        )[4:]
        item['address'] = first(
            '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[3]/div[1]/div[6]/text()'
        )[4:]

        item['text'] = first(
            '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[4]/text()'
        )

        yield item
Example #21
0
    def parse(self, response):
        """Parse a note list and follow the "load more" link.

        Yields one ``JianshuItem`` per ``ul.note-list/li`` entry. If the
        page has exactly one ``data-url`` on the list container's button, a
        request for ``self.url`` plus that value is yielded, handled by
        this same callback.

        ``title``, ``author`` and ``ccommentLimt`` are stored as lists of
        extracted strings.
        """
        selector = Selector(response)

        def at(values, index, default=''):
            """Return ``values[index]``, or `default` when out of range."""
            return values[index] if len(values) > index else default

        for article in selector.xpath('//ul[@class="note-list"]/li'):
            # A fresh item per article: re-yielding one shared instance
            # would let later rows overwrite the fields of earlier ones.
            item = JianshuItem()

            title = article.xpath('div/a/text()').extract()
            url = article.xpath('div/a/@href').extract()

            item['ccommentLimt'] = article.xpath('div/p/text()').extract()
            listtop = article.xpath('div/div/a/text()').extract()
            likeNum = article.xpath('div/div/span/text()').extract()
            author = article.xpath(
                'div/div/div/a[@class="nickname"]/text()').extract()

            item['title'] = title
            item['url'] = 'http://www.jianshu.com/' + url[0]
            item['author'] = author

            # The footer text nodes are addressed by position: index 3 is
            # used for the read count and index 5 for the comment count.
            item['readNum'] = at(listtop, 3)
            # Some articles have comments disabled.
            item['commentNum'] = at(listtop, 5)
            item['likeNum'] = at(likeNum, 0)
            # The second span is optional.
            item['moneyNum'] = at(likeNum, 1)

            yield item

        next_link = selector.xpath(
            '//*[@id="list-container"]/div/button/@data-url').extract()

        if len(next_link) == 1:
            next_link = self.url + str(next_link[0])
            yield Request(next_link, callback=self.parse)
Example #22
0
 def parse(self, response):
     """Yield one ``JianshuItem`` per article title on a note-list page.

     Also prints the number of ``ul.note-list`` elements, the number of
     titles in each, and every title.
     """
     selector = scrapy.Selector(response)
     articles = selector.xpath('//ul[@class="note-list"]')
     # Single-argument print(...) behaves the same on Python 2 and 3.
     print('articles.count:' + str(len(articles)))
     for article in articles:
         # './/' keeps the search inside this <ul>. The previous '//'
         # searched the whole document on every iteration, repeating all
         # titles once per list.
         titles = article.xpath('.//a[@class="title"]/text()').extract()
         print('titles.count:' + str(len(titles)))
         for title in titles:
             print('title:' + title)
             # A fresh item per title: re-yielding one shared instance
             # would let later titles overwrite earlier ones.
             item = JianshuItem()
             item['title'] = title
             yield item
Example #23
0
    def parse(self, response):
        """Yield one ``JianshuItem`` (title and url) per listing entry.

        `response` is the downloaded page. Entries are the <div> children
        selected by the absolute path below; ``title`` and ``url`` are
        stored as lists of extracted strings.
        """
        selector = Selector(response)

        # The fields handled here must be declared in items.py. Selectors
        # for other sites tried earlier were removed; the path below is the
        # one currently in use.
        articles = selector.xpath('/html/body/div[1]/div/div[1]/div[2]/div')

        for article in articles:
            # A fresh item per entry: re-yielding one shared instance would
            # let later rows overwrite the fields of earlier ones.
            item = JianshuItem()

            # Relative XPaths must not start with '/', otherwise they are
            # evaluated from the document root and match nothing here.
            item['title'] = article.xpath('div[1]/div/h3/a/text()').extract()
            item['url'] = article.xpath('div/div/h3/a/@href').extract()

            yield item
Example #24
0
 def parse_item(self, response):
     """Parse an article page and yield one ``JianshuItem``.

     Text fields are whitespace-stripped; a node that is missing from the
     page produces an empty string instead of raising on ``None.strip()``.
     ``subjects`` is every collection name joined with commas.
     """
     def text(xpath):
         """Return the stripped first match of `xpath`, or ''."""
         return response.xpath(xpath).get(default='').strip()

     title = text("//div[@class='article']/h1[@class='title']/text()")
     avatar = text(
         "//div[@class='article']/div[@class='author']/a/img/@src")
     author = text(
         "//div[@class='article']//div[@class='info']/span/a/text()")
     pub_time = text(
         "//div[@class='meta']/span[@class='publish-time']/text()")
     origin_url = response.url

     # Example URL path:
     #   /p/7ba4ea51d56c?utm_campaign=maleskine&utm_content=note&...
     # Drop everything after '?', then keep the last path segment
     # (7ba4ea51d56c). The item field is called ``author_id`` although the
     # value is taken from the page URL.
     author_id = response.url.split('?')[0].split('/')[-1]

     # Article body kept as HTML.
     content = response.xpath("//div[@class='show-content-free']").get()

     read_count = text(
         "//div[@class='meta']/span[@class='views-count']/text()")
     like_count = text(
         "//div[@class='meta']/span[@class='likes-count']/text()")
     word_count = text(
         "//div[@class='meta']/span[@class='wordage']/text()")
     subjects = ",".join(
         response.xpath(
             "//div[@class='include-collection']/a/div[@class='name']/text()"
         ).getall()).strip()
     comment_count = text(
         "//div[@class='meta']/span[@class='comments-count']/text()")

     item = JianshuItem(title=title,
                        avatar=avatar,
                        author=author,
                        pub_time=pub_time,
                        origin_url=origin_url,
                        author_id=author_id,
                        content=content,
                        read_count=read_count,
                        like_count=like_count,
                        word_count=word_count,
                        subjects=subjects,
                        comment_count=comment_count)
     yield item
Example #25
0
    def parse(self, response):
        """Parse an article list page and follow its "load more" link.

        For every ``<li>`` of the article list, a ``JianshuItem`` is filled
        with the title, URL, author and counters, and the thumbnail is
        downloaded on a best-effort basis.  If the page exposes exactly one
        ``data-url`` on the list-container button, a request for that URL is
        scheduled with this method as its callback.

        Args:
            response: The Scrapy response for the list page.

        Yields:
            JianshuItem: One item per article entry.
            Request: At most one request for the next page.
        """
        selector = Selector(response)
        articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')
        for article in articles:
            # A fresh item per article: re-using one instance would let a
            # later iteration overwrite an item that was already yielded.
            item = JianshuItem()

            title = article.xpath('div/h4/a/text()').extract()
            url = article.xpath('div/h4/a/@href').extract()
            author = article.xpath('div/p/a/text()').extract()

            # Best-effort thumbnail download; an entry without an image (or
            # a failed download) is reported and then skipped.
            try:
                image = article.xpath("a/img/@src").extract()
                urllib.urlretrieve(image[0],
                                   '/images/%s-%s.jpg' % (author[0], title[0]))
            except Exception:
                print('--no---image--')

            listtop = article.xpath('div/div/a/text()').extract()
            likeNum = article.xpath('div/div/span/text()').extract()

            item['title'] = title
            item['url'] = 'http://www.jianshu.com/' + url[0]
            item['author'] = author

            # listtop holds the footer link texts: the first is stored as
            # the read count, the second as the comment count.  Either one
            # falls back to '' when the entry does not provide it.
            item['readNum'] = listtop[0] if listtop else ''
            item['commentNum'] = listtop[1] if len(listtop) > 1 else ''
            item['likeNum'] = likeNum
            yield item

        next_link = selector.xpath(
            '//*[@id="list-container"]/div/button/@data-url').extract()

        if len(next_link) == 1:
            # The data-url attribute is appended to the spider's base URL.
            next_link = self.url + str(next_link[0])
            print("----" + next_link)
            yield Request(next_link,
                          callback=self.parse,
                          headers={'User-Agent': " ¡magic br"})
Example #26
0
 def parse_detail(self, response):
     """Fill a ``JianshuItem`` from the request meta and the detail page.

     ``name`` and ``img_url`` are copied from ``response.meta`` (keys
     ``name`` and ``url``); ``name2`` is the first text node found under
     the first ``<span>`` of ``#link-report``.  Several values are printed
     to stdout along the way.

     Args:
         response: The Scrapy response for the detail page.

     Yields:
         JianshuItem: The populated item.

     Raises:
         KeyError: If ``response.meta`` lacks ``name`` or ``url``.
         IndexError: If the summary XPath matches nothing.
     """
     item = JianshuItem()
     request_meta = response.meta
     item['name'] = request_meta['name']
     item['img_url'] = request_meta['url']

     # Original author's notes (unverified): the summary sits at
     #   //*[@id="link-report"]/span[1]
     # and the "view full text" variant at
     #   //*[@id="link-report"]/span[1]/span/text()
     summary_nodes = response.xpath('//*[@id="link-report"]/span[1]/text()')
     summary = summary_nodes.extract()[0]

     heading = response.xpath(
         '//*[@id="content"]/div[3]/div[1]/div[3]/h2/i/text()').extract()

     # Diagnostic output, one value per line.
     for value in (heading, item['name'], response, summary):
         print(value)

     item['name2'] = summary
     yield item
Example #27
0
 def parse_detail(self, response):
     """Extract an article detail page into a ``JianshuItem``.

     The article id is the last path segment of the response URL with any
     query string removed.  Fields whose node is missing are ``None``.

     Args:
         response: The Scrapy response for the article page.

     Yields:
         JianshuItem: A single item for the article.
     """

     def first(query):
         # First match of the XPath query, or None when nothing matches.
         return response.xpath(query).get()

     # Strip the query string, then keep what follows the final '/'.
     path_only = response.url.partition("?")[0]
     article_id = path_only.rpartition("/")[2]

     yield JianshuItem(
         title=first("//h1[@class='title']/text()"),
         avatar=first("//a[@class='avatar']/img/@src"),
         author=first("//span[@class='name']/a/text()"),
         pub_time=first("//span[@class='publish-time']/text()"),
         origin_url=response.url,
         article_id=article_id,
         content=first("//div[@class='show-content']"),
     )
Example #28
0
File: js.py Project: fkeway/jianshu
 def parse(self, response):
     """Scrape an author page, then request that author's fans and follows.

     The author's profile fields are stored in a ``JianshuItem``.  After the
     item is yielded, page 1 of ``self.fans_url`` and ``self.concern_url``
     (formatted with the uid and the page number) is requested.

     Args:
         response: The Scrapy response for the author page.

     Yields:
         JianshuItem: The author's profile.
         scrapy.Request: One request for the fans list and one for the
             concern list, each carrying ``{'page': 1}`` in its meta.
     """
     item = JianshuItem()
     item['author_uid'] = response.xpath("//a[@class='name']/@href").get()
     item['author_url'] = response.url

     # Remaining fields are each the first match of one XPath query.
     remaining_fields = (
         ('author', "//a[@class='name']/text()"),
         ('fans', "//div[@class='info']/ul/li[2]//p/text()"),
         ('concern', "//div[@class='info']/ul/li[1]//p/text()"),
         ('article', "//div[@class='info']/ul/li[3]//p/text()"),
         ('word_count', "//div[@class='info']/ul/li[4]//p/text()"),
         ('js_diamond', "//div[@class='info']/ul/li[6]//p/text()"),
         ('tag', "//div[@class='js-intro']/text()"),
     )
     for field, query in remaining_fields:
         item[field] = response.xpath(query).get()
     yield item

     # The uid is the last '/'-separated segment of the stored author URL.
     uid = item['author_url'].rsplit('/', 1)[-1]
     first_page = 1
     fans_request = scrapy.Request(
         url=self.fans_url.format(uid, first_page),
         callback=self.parse_fans,
         meta={'page': first_page})
     yield fans_request
     concern_request = scrapy.Request(
         url=self.concern_url.format(uid, first_page),
         callback=self.parse_concern,
         meta={'page': first_page})
     yield concern_request
 def parse(self, res):
     """Yield one ``JianshuItem`` per entry of the page's note list.

     Args:
         res: The Scrapy response for a page containing
             ``<ul class="note-list">``.

     Yields:
         JianshuItem: Author, title, abstract, URL, formatted publish time
             and the read / comment / collect counters of one entry, with
             all whitespace removed from the counters.

     Raises:
         IndexError: If an entry lacks one of the expected nodes.
     """
     for article in res.xpath('//ul[@class="note-list"]/li'):
         # A fresh item per entry: re-using one instance would let a later
         # iteration overwrite an item that was already yielded.
         item = JianshuItem()
         item['author'] = article.xpath('.//div[@class="info"]/a[1]/text()').extract()[0]
         item['title'] = article.xpath('.//div[@class="content"]/a[1]/text()').extract()[0]
         item['abstract'] = article.xpath('normalize-space(.//p[@class="abstract"]/text())').extract()[0]
         publish_time = article.xpath('.//div[@class="info"]//span/@data-shared-at').extract()[0]
         item_url = article.xpath('.//div[@class="content"]/a/@href').extract()[0]
         item['item_url'] = 'http://www.jianshu.com' + str(item_url)
         # The counters are taken from the second text node of each link
         # (presumably the first one is whitespace before an icon -- verify
         # against the page markup).
         read_number = article.xpath('.//div[@class="meta"]/a[1]/text()').extract()[1]
         comment_number = article.xpath('.//div[@class="meta"]/a[2]/text()').extract()[1]
         collect_number = article.xpath('.//div[@class="meta"]/span/text()').extract()[0]
         item['pulish_time'] = DealFunction().format_time(publish_time)
         # Each counter is stored from its own source value; the comment
         # count must not be derived from the read count.
         item['comment_number'] = re.sub(r'\s+', '', comment_number)
         item['read_number'] = re.sub(r'\s+', '', read_number)
         item['collect_number'] = re.sub(r'\s+', '', collect_number)
         yield item
Example #30
0
 def parse(self, response):
     """Yield one ``JianshuItem`` per entry of the page's note list.

     Args:
         response: The Scrapy response for a page containing
             ``<ul class='note-list'>``.

     Yields:
         JianshuItem: Author, title, share timestamp, absolute URL and the
             ``admire`` / ``likes`` texts of one entry.

     Raises:
         IndexError: If an entry lacks the author, title, timestamp or link
             node.
     """
     for article in response.xpath("//ul[@class='note-list']/li"):
         # A fresh item per entry: re-using one instance would let a later
         # iteration overwrite an item that was already yielded.
         item = JianshuItem()
         item['author'] = article.xpath(
             './/div[@class="info"]/a/text()').extract()[0]
         item['title'] = article.xpath(
             './/div[@class="content"]/a/text()').extract()[0]
         item['times'] = article.xpath(
             './/div[@class="info"]/span/@data-shared-at').extract()[0]
         url = article.xpath(
             './/div[@class="content"]/a/@href').extract()[0]
         item['url'] = 'http://www.jianshu.com' + url
         # These two spans may be absent; joining the extracted list gives
         # '' in that case instead of raising.
         admire = article.xpath('.//div/div[2]/span[2]/text()').extract()
         item['admire'] = ''.join(admire)
         likes = article.xpath('.//div/div[2]/span[1]/text()').extract()
         item['likes'] = ''.join(likes)
         yield item