Code Example #1
File: joble.py Project: cshk/scrapy
    def parse_detail(self, response):
        # item = JobboleItem()
        # title = response.xpath("//div[@class='entry-header']/h1/text()").extract_first()
        # create_time = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first().strip().replace('·','')
        # content = response.xpath('//div[@class="entry"]').extract_first()
        # tag_lst = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        # tag_lst = [element for element in tag_lst]
        # tags = ",".join(tag_lst)
        #
        # item['url_obj_id'] = get_md5(response.url)
        # item['url'] = response.url
        # item['title'] = title
        # item['create_time'] = create_time
        # item['tags'] = tags
        # item['content'] = content

        item_loader = MyItemLoader(item=JobboleItem(), response=response)
        item_loader.add_xpath("title",
                              "//div[@class='entry-header']/h1/text()")
        item_loader.add_xpath(
            "create_time", '//p[@class="entry-meta-hide-on-mobile"]/text()')
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_obj_id", get_md5(response.url))
        item_loader.add_xpath(
            "tags", '//p[@class="entry-meta-hide-on-mobile"]/a/text()')
        item_loader.add_xpath("content", '//div[@class="entry"]')
        item = item_loader.load_item()
        yield item
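
Several of these snippets call a get_md5 helper and a project-specific ItemLoader subclass (MyItemLoader here, ArticleItemLoader / JobBoleArticleItemLoader below) without showing them. A minimal sketch of what they typically look like follows; the implementation details are assumptions inferred from how the snippets use them, not code taken from the projects.

# Assumed sketch of the helpers used above: a fixed-length URL id and a loader
# whose default output processor keeps a single value instead of a list.
import hashlib

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


def get_md5(url):
    # md5 the URL into a 32-character hex digest, usable as a fixed-length key.
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()


class MyItemLoader(ItemLoader):
    # Assumed: every field yields its first extracted value rather than a list.
    default_output_processor = TakeFirst()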
Code Example #2
File: Article.py Project: ColgateKas/PythonStudy
    def parse_detail(self, response):
        article_item = JobboleItem()
        # URL of the article cover image
        front_image_url = response.meta.get("front_image_url", "")
        title = response.xpath(
            '//div[@class="entry-header"]/h1/text()').extract_first()
        create_date = response.xpath(
            '//p[@class="entry-meta-hide-on-mobile"]/text()').extract(
            )[0].strip().split()[0]

        tag_list = response.xpath(
            '//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        tag_list = [
            element for element in tag_list
            if not element.strip().endswith("评论")
        ]
        tag = ",".join(tag_list)
        praise_nums = response.xpath(
            '//span[contains(@class,"vote-post-up")]/h10/text()').extract()
        if len(praise_nums) == 0:
            praise_nums = 0
        else:
            praise_nums = int(praise_nums[0])
        fav_nums = response.xpath(
            '//span[contains(@class,"bookmark-btn")]/text()').extract()[0]
        match_re = re.match(r".*?(\d+).*", fav_nums)
        if match_re:
            fav_nums = int(match_re.group(1))
        else:
            fav_nums = 0
        comment_nums = response.xpath(
            "//a[@href='#article-comment']/span/text()").extract()[0]
        match_com = re.match(r".*?(\d+).*", comment_nums)
        if match_com:
            comment_nums = int(match_com.group(1))
        else:
            comment_nums = 0

        content = response.xpath('//div[@class="entry"]').extract()[0]

        article_item["url_object_id"] = get_md5(response.url)  #这里对地址进行了md5变成定长
        article_item["title"] = title
        article_item["url"] = response.url
        try:
            create_date = datetime.datetime.strptime(create_date,
                                                     '%Y/%m/%d').date()
        except Exception as e:
            create_date = datetime.datetime.now().date()

        article_item["create_date"] = create_date
        article_item["front_image_url"] = [front_image_url]
        article_item["praise_nums"] = int(praise_nums)
        article_item["fav_nums"] = fav_nums
        article_item["comment_nums"] = comment_nums
        article_item["tag"] = tag
        article_item['content'] = content

        yield article_item
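
A note on the number-parsing regex used above (and repeated in several of the later examples): with a greedy leading `.*` the capture group only ever receives the last digit, which silently truncates counts of 10 or more, hence the non-greedy `.*?` form. A quick illustration:

import re

text = "18 收藏"  # bookmark text as it appears on the page
re.match(r".*(\d+).*", text).group(1)   # -> "8"  (greedy .* swallows the leading digit)
re.match(r".*?(\d+).*", text).group(1)  # -> "18" (non-greedy .*? keeps the full number)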
Code Example #3
File: jobbbole.py Project: JemmyH/jobbole
 def parse_datail(self, response):
     title = response.xpath(
         "/html/body/div[1]/div[3]/div[1]/div[1]/h1/text()").extract()[0]
     date = response.xpath("/html/body/div[1]/div[3]/div[1]/div[2]/p/text()"
                           ).extract()[0].strip().replace(" ·", "")
     url = str(response.url)
     item = JobboleItem()
     item["title"] = title
     item["date"] = date
     item["url"] = url
     yield item
Code Example #4
    def parse_detail(self, response):
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = JobBoleArticleItemLoader(item=JobboleItem(),
                                               response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
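
Examples #4, #11 and #12 push all the cleanup into a custom loader (JobBoleArticleItemLoader / ArticleItemLoader) whose definition is not shown on this page. The sketch below is an assumed, typical implementation of that pattern: a TakeFirst default output processor plus per-field input processors that parse the date and the numeric counts.

# Assumed sketch of the custom item loader used by examples #4, #11 and #12.
import datetime
import re

from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, TakeFirst


def parse_date(value):
    # "2017/03/18 ·  职场" -> date(2017, 3, 18); fall back to today on bad input.
    try:
        return datetime.datetime.strptime(value.strip().split()[0], "%Y/%m/%d").date()
    except (ValueError, IndexError):
        return datetime.datetime.now().date()


def parse_number(value):
    # " 2 收藏" -> 2; missing or non-numeric text -> 0.
    match = re.match(r".*?(\d+).*", value)
    return int(match.group(1)) if match else 0


class JobBoleArticleItemLoader(ItemLoader):  # called ArticleItemLoader in other snippets
    default_output_processor = TakeFirst()
    # Per-field processors via the <field>_in / <field>_out naming convention.
    create_date_in = MapCompose(parse_date)
    praise_nums_in = MapCompose(parse_number)
    comment_nums_in = MapCompose(parse_number)
    fav_nums_in = MapCompose(parse_number)
    tags_out = Join(",")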
Code Example #5
 def parse(self, response):
     sel = Selector(response).xpath('//div[@class="post floated-thumb"]')
     for content in sel:
         item = JobboleItem()
         item['title'] = content.xpath(
             'div[2]/p[1]/a[1]/text()').extract_first()
         item['url'] = content.xpath(
             'div[2]/p[1]/a[1]/@href').extract_first()
         if item['url']:
             # Pass the partially-filled item along via meta; get_content is
             # expected to finish populating it and yield the completed item.
             yield Request(item['url'],
                           callback=self.get_content,
                           meta={'item': item})
Code Example #6
    def parse(self, response):

        jobs = response.xpath(
            '//*[@id="archive"]/div[@class="post floated-thumb"]')
        for j in jobs:
            item = JobboleItem()
            item['title'] = j.xpath('./div[2]/p[1]/a[1]/text()').extract_first()
            yield item

        next_page = response.xpath(
            '//*[@id="archive"]/div[21]/a[4]/@href').extract_first()  # link to the next page
        if next_page is not None:  # only follow when a next page exists
            next_page = response.urljoin(next_page)
            yield scrapy.http.Request(next_page,
                                      callback=self.parse,
                                      dont_filter=True)  # hand the next page back to parse
Code Example #7
File: jobbole_spider.py Project: isysc1/boleSpider
 def jobbole_parse(self, response):
     date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()'
                           ).getall()[0].strip()
     publish_time = re.sub('·', '', date)
     category = response.xpath(
         '//p[@class="entry-meta-hide-on-mobile"]/a/text()').get()
     origin_link = response.xpath(
         '//div[@class="copyright-area"]/a/@href').get()
     origin_author = response.xpath(
         '//div[@class="copyright-area"]/a/text()').get()
     content = response.xpath('//div[@class="entry"]').getall()
     title = response.xpath('//div[@class="entry-header"]/h1/text()').get()
     item = JobboleItem(
         publish_time=publish_time,
         category=category,
         origin_link=origin_link,
         title=title,
         origin_author=origin_author,
         content=content,
     )
     yield item
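
This snippet uses the newer Selector API: get() and getall() are simply the current names for the extract_first() and extract() calls seen in the other examples, so the two pairs are interchangeable:

from parsel import Selector

sel = Selector(text='<h1>Hello Scrapy</h1>').xpath('//h1/text()')
sel.get()      # 'Hello Scrapy'   - first match, same as sel.extract_first()
sel.getall()   # ['Hello Scrapy'] - all matches, same as sel.extract()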
Code Example #8
 def parseJobDetail(self, response):
     print(response.status)
     item = JobboleItem()
     item['title'] = response.xpath(
         '//div[@class="grid-8"]/div/div[@class="entry-header"]/h1/text()'
     )[0].extract()
     item['creation_time'] = response.xpath(
         './/p[@class="entry-meta-hide-on-mobile"]/text()')[0].extract()
     item['article_addresses'] = response.url
     item['image_links'] = response.xpath(
         '//*[@class="entry"]//img/@src').extract()
     item['praise_num'] = response.xpath(
         '//div[@class="post-adds"]/span[1]/h10/text()').extract()
     item['collect_num'] = response.xpath(
         '//div[@class="post-adds"]/span[2]/text()').extract()
     item['comment_num'] = response.xpath(
         '//span[@class="btn-bluet-bigger href-style hide-on-480"]/text()'
     ).extract()
     item['centent'] = response.xpath(
         "//div[@class='entry']//p/text()").extract()
     item['label'] = response.xpath(
         '//div[@class="entry-meta"]/p/a[3]/text()').extract()
     yield item
Code Example #9
File: Jobbole.py Project: aini626204777/spider
 def parse_content(self, response):
     print('start matching')
     item = JobboleItem()
     # title
     item['title'] = response.xpath('//div[@class="entry-header"]/h1/text()').extract()[0]
     # creation time
     item['create_data'] = response.xpath('//div[@class="entry-meta"]/p/text()').extract()[0]
     # article URL
     item['url'] = response.url
     # image link URL
     item['img_url'] = response.xpath('//img[@class="aligncenter"][1]/@src').extract_first('')
     # upvote count
     item['praise_nums'] = response.xpath('//div[@class="post-adds"]/span[1]/h10/text()').extract()[0]
     # print(praise_nums)
     # bookmark count (only the first character of the text is kept, i.e. a single-digit count)
     item['bookmark_nums'] = response.xpath('//span[@class=" btn-bluet-bigger href-style bookmark-btn  register-user-only "]/text()').extract_first('0')[0]
     # comment count (same single-character slice)
     item['comment_nums'] = response.xpath('//div[@class="post-adds"]/a/span/h10/text()').extract_first('0')[0]
     # article content
     item['content'] = response.xpath('//div[@class="entry"]//p/text()').extract()
     # print(content)
     # tags
     item['tags'] = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
     yield item
Code Example #10
    def parse_detail(self, response):
        """提取文章内的字段"""
        article_item = JobboleItem()

        # --------------------------------- extract fields with XPath selectors --------------------------------------
        # title = response.xpath('//*[@id="post-114690"]/div[1]/h1/text()').extract_first()
        # create_date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first().strip().replace(
        #     '·', '').strip()
        # praise_number = response.xpath('//span[contains(@class, "vote-post-up")]/h10/text()').extract_first()
        #
        # favorite_numbers = response.xpath('//span[contains(@class, "bookmark-btn")]/text()').extract_first()  # returns e.g. '2 收藏'
        # match_re = re.match(r'.*?(\d+).*', favorite_numbers)
        # if match_re:
        #     favorite_numbers = int(match_re.group(1))
        # else:
        #     favorite_numbers = 0
        #
        # comment_numbers = response.xpath('//a[@href="#article-comment"]/span/text()').extract_first()
        # match_re = re.match(r'.*?(\d+).*', comment_numbers)
        # if match_re:
        #     comment_numbers = int(match_re.group(1))
        # else:
        #     comment_numbers = 0
        # content = response.xpath('//div[@class="entry"]').extract_first()
        #
        # tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()  # get the article tags
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]  # filter out the comment-count entry
        # tags = ','.join(tag_list)  # join the list into a single string

        # --------------------------------- extract fields with CSS selectors --------------------------------------
        front_image_url = response.meta.get('front_image_url', '')  # article cover image
        title = response.css('.entry-header h1::text').extract_first()
        create_date = response.css('.entry-meta-hide-on-mobile::text'
                                   ).extract_first().strip().replace(
                                       '·', '').strip()
        praise_numbers = response.css(
            '.vote-post-up h10::text').extract_first()
        favorite_numbers = response.css('.bookmark-btn::text').extract_first()
        match_re = re.match(r'.*?(\d+).*', favorite_numbers)
        if match_re:
            favorite_numbers = int(match_re.group(1))
        else:
            favorite_numbers = 0

        comment_numbers = response.css(
            'a[href="#article-comment"] span::text').extract_first()
        match_re = re.match(r'.*?(\d+).*', comment_numbers)
        if match_re:
            comment_numbers = int(match_re.group(1))
        else:
            comment_numbers = 0

        content = response.css('div.entry').extract()
        tag_list = response.css('.entry-meta-hide-on-mobile a::text').extract()
        tag_list = [
            element for element in tag_list
            if not element.strip().endswith("评论")
        ]  # filter out the comment-count entry
        tags = ','.join(tag_list)  # join the list into a single string

        article_item['url_object_id'] = get_md5(response.url)  # md5 the URL into a fixed-length id
        article_item['title'] = title
        article_item['create_date'] = create_date
        article_item['url'] = response.url
        article_item['front_image_url'] = [front_image_url]  # scrapy's image pipeline expects a list of URLs
        article_item['praise_numbers'] = praise_numbers
        article_item['comment_numbers'] = comment_numbers
        article_item['favorite_numbers'] = favorite_numbers
        article_item['tags'] = tags
        article_item['content'] = content

        yield article_item
Code Example #11
File: Jobbole.py Project: buppter/JobboleScrapy
    def parse_detail(self, response):
        '''
        article_item = JobboleItem()
        # extract the article's individual fields

        # article cover image
        front_image_url = response.meta.get("front_image_url", "")

        # article title
        title = response.css('.entry-header h1::text').extract_first()
        # title = response.xpath("//div[@class='entry-header']/h1/text()").extract_first()

        # creation time
        create_time = response.css('.entry-meta-hide-on-mobile::text').extract_first().strip().replace(' ·', '')
        # create_time = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract()[0].strip().replace(' ·', '')

        # upvote count
        vote_nums = response.css('.vote-post-up h10::text').extract_first()
        # vote_nums = response.xpath('//span[contains(@class,"vote-post-up")]/h10/text()').extract_first()
        if vote_nums:
            vote_nums = int(vote_nums)
        else:
            vote_nums = 0

        # bookmark count
        mark_nums = response.css('.bookmark-btn::text').extract_first()
        # mark_nums = response.xpath('//span[contains(@class,"bookmark-btn")]/text()').extract_first()
        match_re = re.match('.*?(\d+).*?', mark_nums)
        if match_re:
            mark_nums = int(match_re.group(1))
        else:
            mark_nums = 0

        # comment count
        comment_nums = response.css('.btn-bluet-bigger.href-style.hide-on-480::text').extract_first()
        # comment_nums = response.xpath('//span[@class="btn-bluet-bigger href-style hide-on-480"]/text()').extract_first()
        match_re = re.match('.*?(\d+).*?', comment_nums)
        if match_re:
            comment_nums = int(match_re.group(1))
        else:
            comment_nums = 0

        # article content; only the raw HTML is extracted here
        content = response.css('.entry').extract_first()
        # content = response.xpath('//div[@class="entry"]').extract_first()

        # tags
        tags_list = response.css('.entry-meta-hide-on-mobile a::text').extract()
        # tags = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        tags_list = [element for element in tags_list if not element.strip().endswith("评论")]
        tags = ','.join(tags_list)

        article_item["title"] = title
        article_item["url"] = response.url
        try:
            create_time = datetime.datetime.strptime(create_time, '%Y/%m/%d').date()
        except Exception as e:
            create_time = datetime.datetime.now().date()
        article_item["create_time"] = create_time
        article_item["vote_nums"] = vote_nums
        article_item["mark_nums"] = mark_nums
        article_item["comment_nums"] = comment_nums
        article_item["content"] = connent
        article_item["tags"] = tags
        article_item["front_image_url"] = [front_image_url]
        article_item["url_object_id"] = get_md5(response.url)
        '''
        # populate the item through an ItemLoader
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = ArticleItemLoader(item=JobboleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        # item_loader.add_xpath()
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_value('front_image_url', [front_image_url])
        item_loader.add_css('create_time', '.entry-meta-hide-on-mobile::text')
        item_loader.add_css('vote_nums', '.vote-post-up h10::text')
        item_loader.add_css('mark_nums', '.bookmark-btn::text')
        item_loader.add_css('comment_nums', '.btn-bluet-bigger.href-style.hide-on-480::text')
        item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')
        item_loader.add_css('content', '.entry')
        article_item = item_loader.load_item()
        yield article_item
Code Example #12
    def parse_detail(self, response):
        article_item = JobboleItem()

        # extract the article's individual fields
        # title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first("")
        # create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].strip().replace("·","").strip()
        # praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0]
        # fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0]
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = match_re.group(1)
        #
        # comment_nums = response.xpath("//a[@href='#article-comment']/span/text()").extract()[0]
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = match_re.group(1)
        #
        # content = response.xpath("//div[@class='entry']").extract()[0]
        #
        # tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)

        # extract fields with CSS selectors
        # front_image_url = response.meta.get("front_image_url", "")  # article cover image
        # title = response.css(".entry-header h1::text").extract()[0]
        # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·","").strip()
        # praise_nums = response.css(".vote-post-up h10::text").extract()[0]
        # fav_nums = response.css(".bookmark-btn::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        #
        # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        #
        # content = response.css("div.entry").extract()[0]
        #
        # tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)
        #
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item['title'] = title
        # article_item['url'] = response.url
        # # convert the date string into a date object
        # try:
        #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        # except Exception as e:
        #     create_date = datetime.datetime.now().date()
        # article_item['create_date'] = create_date
        # article_item['front_image_url'] = [front_image_url]
        # article_item['praise_nums'] = int(praise_nums)
        # article_item['comment_nums'] = comment_nums
        # article_item['fav_nums'] = fav_nums
        # article_item['tags'] = tags
        # article_item['content'] = content
        # print(article_item)

        # populate the item through an item loader
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        # ArticleItemLoader is a custom item loader that keeps only the first element of each value list
        item_loader = ArticleItemLoader(item=JobboleItem(), response=response)
        # item_loader = ItemLoader(item=JobboleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
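
None of the snippets include the JobboleItem definition itself, and the field names clearly differ from project to project (create_date vs create_time vs create_data, fav_nums vs mark_nums, and so on). Purely as an assumed illustration, an item matching the field names used in example #12 could look like this:

# Assumed sketch of a JobboleItem with the field names used in example #12.
import scrapy


class JobboleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field()
    praise_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    fav_nums = scrapy.Field()
    tags = scrapy.Field()
    content = scrapy.Field()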