コード例 #1
0
ファイル: jobbole.py プロジェクト: zero6996/ArticleSpider
    def parse_detail(self, response):
        """Build a ``JobboleArticleItem`` from an article detail page.

        Raw selector results are handed to ``ArticleItemLoader`` unchanged;
        any cleaning presumably happens in the loader/item processors, which
        are defined outside this method.

        Args:
            response: Detail-page response. ``response.meta`` may carry
                ``front_image_url`` (the article cover image).

        Yields:
            The item returned by ``item_loader.load_item()``.
        """
        # Cover image URL passed along in the request meta ("" when absent).
        front_image_url = response.meta.get('front_image_url', "")

        item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                        response=response)
        item_loader.add_css("title", '.entry-header>h1::text')
        item_loader.add_value("url", response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
        item_loader.add_value('front_image_url', [front_image_url])

        # Fall back to an explicit "0" when the page has no vote counter, so
        # the field is always populated.
        praise_css = '.vote-post-up>h10::text'
        if response.css(praise_css):
            item_loader.add_css("praise_number", praise_css)
        else:
            item_loader.add_value("praise_number", "0")

        item_loader.add_css("comment_nums",
                            'a[href="#article-comment"]>span::text')
        item_loader.add_css('fav_nums', '.bookmark-btn::text')
        item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile>a::text')
        item_loader.add_css('content', 'div.entry')

        yield item_loader.load_item()
コード例 #2
0
ファイル: jobbole.py プロジェクト: PantaSun/ScrapyPractice
    def parse_detail(self, response):
        """Extract the article fields from a detail page with CSS selectors.

        All fields are collected through ``ArticleItemLoader``; the raw
        selector output is passed on as-is, so any conversion (numbers,
        dates, tag joining) is presumably done by the loader/item processors
        defined elsewhere.

        Args:
            response: Detail-page response. ``response.meta`` may carry the
                cover image URL under the key ``"font_img_url"``.

        Yields:
            The item returned by ``item_loader.load_item()``.
        """
        # NOTE(review): the meta key is spelled "font_img_url" (not
        # "front_img_url"). It has to match the key used by the request that
        # scheduled this callback - confirm against the caller before renaming.
        front_img_url = response.meta.get("font_img_url", "")

        item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                        response=response)
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_css('create_date', '.entry-meta p::text')
        item_loader.add_css('praise_nums', '.post-adds span h10::text')
        item_loader.add_css('fav_nums', '.bookmark-btn::text')
        item_loader.add_css('comment_nums',
                            'a[href="#article-comment"] span::text')
        item_loader.add_css('tags', '.entry-meta p a::text')
        item_loader.add_css('content', '.entry')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        # Wrapped in a list, as the original code does for the image field.
        item_loader.add_value('front_img_url', [front_img_url])

        yield item_loader.load_item()
コード例 #3
0
    def parse_detail(self, response):
        """Scrape the article fields from a detail page with XPath queries.

        All fields are collected through ``ArticleItemLoader``; the raw
        XPath output is passed on as-is, so any conversion is presumably
        done by the loader/item processors defined elsewhere.

        Args:
            response: Detail-page response. ``response.meta`` may carry the
                cover image URL under the key ``"font_image_url"``.

        Yields:
            The item returned by ``item_loader.load_item()``.
        """
        # NOTE(review): the meta key is spelled "font_image_url" (not
        # "front_image_url"). It has to match the key used by the request
        # that scheduled this callback - confirm against the caller.
        front_image_url = response.meta.get("font_image_url", "")

        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
        # Wrapped in a list, as the original code does for the image field.
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_xpath("praise_nums", "//span[contains(@class, 'vote-post-up')]/h10/text()")
        item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
        item_loader.add_xpath("fav_nums", "//span[contains(@class, 'bookmark-btn')]/text()")
        item_loader.add_xpath("content", "//div[@class='entry']")
        item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")

        yield item_loader.load_item()
コード例 #4
0
ファイル: jobbole.py プロジェクト: A1xuewuya/py
    def parse_detail(self, response):
        """Extract the individual article fields from a detail page.

        Title, publish time, tags, content, vote count, bookmark count and
        comment count are read with XPath queries. In the code visible here
        the values are only extracted; a fresh ``JobboleArticleItem`` is
        created at the end but not yet populated.

        Args:
            response: Detail-page response to query.

        Raises:
            IndexError: If the title or publish-time node is missing
                (``extract()[0]`` on an empty result).
        """
        article_title = response.xpath(
            "//div[@class='entry-header']/h1/text()").extract()[0]
        # The meta line looks like "<date> ·"; drop the separator dot.
        publish_time = response.xpath(
            "//p[@class='entry-meta-hide-on-mobile']/text()").extract(
            )[0].strip().replace("·", "").strip()
        tag_list = response.xpath(
            "//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
        # Fixed: this used to call the non-existent ``response.xaaapath``,
        # which raised AttributeError on every page.
        article_content = response.xpath(
            "//div[@class='entry']").extract_first()
        star_num = response.xpath(
            "//div[@class='post-adds']//h10/text()").extract_first()

        # Default to "" so a missing node falls through to the 0 branch
        # instead of making re.match() fail on None.
        bookmark_data = response.xpath(
            "//span[contains(@class, 'bookmark-btn')]/text()").extract_first("")
        bookmark_re = re.match(r".*?(\d+).*", bookmark_data)
        bookmark_num = int(bookmark_re.group(1)) if bookmark_re else 0

        comment_data = response.xpath(
            "//a[@href='#article-comment']/span/text()").extract_first("")
        comment_re = re.match(r".*?(\d+).*", comment_data)
        comment_num = int(comment_re.group(1)) if comment_re else 0

        # Item that the extracted values are meant to be mapped onto.
        article_item = JobboleArticleItem()
コード例 #5
0
    def parse_detail(self, response):
        """Parse a news detail page and request its AJAX statistics.

        The numeric post id is taken from the first run of digits in
        ``response.url``. When the URL contains no digits nothing is yielded.
        Otherwise a partially filled ``JobboleArticleItem`` is attached to a
        follow-up request for ``/NewsAjax/GetAjaxNewsInfo`` whose callback is
        ``self.parse_news_info``.

        Args:
            response: Detail-page response. ``response.meta`` may carry
                ``front_image_url`` (the article cover image).

        Yields:
            A ``Request`` carrying the item in ``meta["article_item"]``.

        Raises:
            IndexError: If the page has no ``#news_content`` element.
        """
        id_match = re.match(r".*?(\d+)", response.url)
        if not id_match:
            return
        # First run of digits in the URL is used as the content id.
        post_id = id_match.group(1)

        article_item = JobboleArticleItem()
        title = response.css("#news_title a::text").extract_first("")
        create_date = response.css("#news_info .time::text").extract_first("")
        # Keep only the part of the text that starts at the first digit.
        date_match = re.match(r".*?(\d+.*)", create_date)
        if date_match:
            create_date = date_match.group(1)
        content = response.css("#news_content").extract()[0]
        tags = ",".join(response.css(".news_tags a::text").extract())

        article_item["title"] = title
        article_item["create_date"] = create_date
        article_item["content"] = content
        article_item["tags"] = tags
        article_item["url"] = response.url

        # The image field must be a list of URLs. The original default was
        # " " (a truthy single space), so a missing key produced [""] instead
        # of an empty list; default to "" so the empty case really is [].
        front_image_url = response.meta.get("front_image_url", "")
        article_item["front_image_url"] = (
            [front_image_url] if front_image_url else [])

        # Hand the half-filled item to parse_news_info through the meta dict.
        yield Request(url=parse.urljoin(
            response.url,
            "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                      meta={"article_item": article_item},
                      callback=self.parse_news_info)
コード例 #6
0
    def parse_detail(self, response):
        """Fill a ``JobboleArticleItem`` from an article page using CSS.

        Counters keep their extracted string form; when nothing usable is
        found they fall back to the integer ``0``.

        Args:
            response: Detail-page response. ``response.meta`` may carry
                ``remote_img_url``.

        Yields:
            The populated item.

        Raises:
            IndexError: If the title or the ``.entry`` element is missing.
        """
        css = response.css

        def labelled_count(selector, label):
            # Remove the textual label and surrounding whitespace; an empty
            # remainder becomes the integer 0.
            stripped = css(selector).extract_first('').replace(label, '').strip()
            return stripped or 0

        cover = response.meta.get('remote_img_url', '')
        heading = css(".entry-header h1::text").extract()[0]
        # Drop the last three characters of the meta text before trimming.
        published = css('.entry-meta-hide-on-mobile::text').extract_first('')[:-3].strip()
        praise = css('.vote-post-up  h10::text').extract_first(0)
        favourites = labelled_count('.bookmark-btn::text', '收藏')
        comments = labelled_count("a[href='#article-comment'] span::text", '评论')
        body = css('.entry').extract()[0]

        # The comment-count link lives among the tag links; leave it out.
        joined_tags = '-'.join(
            text
            for text in css('.entry-meta-hide-on-mobile a::text').extract()
            if not text.strip().endswith('评论'))

        article_item = JobboleArticleItem()
        for field, value in (
                ('remote_img_url', [cover]),
                ('title', heading),
                ('publish_date', published),
                ('praise_nums', praise),
                ('fav_nums', favourites),
                ('comment_nums', comments),
                ('content', body),
                ('tags', joined_tags),
        ):
            article_item[field] = value

        yield article_item
コード例 #7
0
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` from an article page via ``MyItemLoader``.

        Args:
            response: Detail-page response. ``response.meta`` may carry
                ``front_img_url``; it is ``None`` when the key is absent.

        Yields:
            The item returned by the loader's ``load_item()``.
        """
        cover_url = response.meta.get("front_img_url")
        loader = MyItemLoader(item=JobboleArticleItem(),
                              response=response)

        # (field, CSS selector) pairs, added in this order.
        css_fields = (
            ("create_date", "p.entry-meta-hide-on-mobile::text"),
            ("title", "div.entry-header h1::text"),
            ("tag", "p.entry-meta-hide-on-mobile a::text"),
            ("vote_num", "div.post-adds span h10::text"),
            ("collect_num", "div.post-adds span.bookmark-btn::text"),
            ("comment_num", 'a[href="#article-comment"] span::text'),
            ("content", "div.entry"),
        )
        for field, selector in css_fields:
            loader.add_css(field, selector)

        page_url = response.url
        # MD5 hex digest of the UTF-8 encoded page URL.
        url_digest = hashlib.md5(page_url.encode(encoding='utf-8')).hexdigest()
        for field, value in (
                ("front_image_url", [cover_url]),
                ("url", page_url),
                ("url_id", url_digest),
        ):
            loader.add_value(field, value)

        yield loader.load_item()
コード例 #8
0
ファイル: jobbole.py プロジェクト: LouisYZK/dist
    def parse_article(self, response):
        """Parse an article page into a ``JobboleArticleItem``.

        Extracts title, creation date, tags and the praise / favourite /
        comment counters. The date falls back to today's date when the text
        is not in ``%Y/%m/%d`` form. If any counter cannot be extracted, all
        three counters are set to ``None`` and the error is printed.

        Args:
            response: Article-page response to query.

        Yields:
            The populated item.

        Raises:
            IndexError: If the title or the date text node is missing.
        """
        title = response.xpath(
            "//*[@class='entry-header']/h1/text()").extract()[0].strip()

        # Named ``date_text`` rather than ``time`` so the local does not
        # shadow the standard ``time`` module name.
        date_text = response.css(
            ".entry-meta-hide-on-mobile::text")[0].extract().strip().replace(
                ' ·', '')
        try:
            create_date = datetime.datetime.strptime(
                date_text, "%Y/%m/%d").date()
        except ValueError:
            # Text did not match the expected format; use today's date.
            create_date = datetime.datetime.now().date()

        # Keep only the entries whose stripped text starts with a non-digit.
        # The original removed entries from the list while iterating over it,
        # which skips the element following every removal; building a new
        # sequence filters every entry.
        tag_texts = response.css(".entry-meta-hide-on-mobile a::text").extract()
        tags = ','.join(
            text for text in tag_texts if re.match(r'\D.*', text.strip()))

        article_data = response.css(".post-adds span")
        try:
            praise_num = self.extratc_num(
                article_data[0].css("h10::text").extract()[0])
            span_texts = response.css(".post-adds span::text").extract()
            favor_num = self.extratc_num(span_texts[2])
            comment_num = self.extratc_num(span_texts[3])
        except Exception as e:
            # Best effort: ``extratc_num`` is defined elsewhere and may raise
            # anything, so the counters are blanked rather than failing the
            # whole page.
            praise_num, favor_num, comment_num = None, None, None
            print(e)

        article_item = JobboleArticleItem()
        article_item['title'] = title
        article_item['create_date'] = create_date
        article_item['tags'] = tags
        article_item['url'] = response.url
        article_item['url_object_id'] = get_md5(response.url)
        article_item['praise_num'] = praise_num
        article_item['favor_num'] = favor_num
        article_item['comment_num'] = comment_num

        # Yielding the item is what passes it on from this callback.
        yield article_item
コード例 #9
0
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` from an article page via XPath.

        Args:
            response: Detail-page response. ``response.meta`` may carry
                ``front_image_url`` (the article cover image).

        Yields:
            The item returned by the loader's ``load_item()``.
        """
        loader = ArticleItemLoader(item=JobboleArticleItem(),
                                   response=response)
        page_url = response.url

        loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
        loader.add_value('url', page_url)
        loader.add_value('url_object_id', get_md5(page_url))
        loader.add_xpath('create_date',
                         '//p[@class="entry-meta-hide-on-mobile"]/text()')

        # Cover image URL from the request meta, wrapped in a list.
        cover_url = response.meta.get("front_image_url", "")
        loader.add_value('front_image_url', [cover_url])

        loader.add_xpath('comment_nums',
                         '//a[@href="#article-comment"]/span/text()')

        # Use the string '0' when the node is absent.
        fav_raw = response.xpath(
            '//div[@class="post-adds"]/span[2]/h10/text()').extract_first()
        loader.add_value('fav_nums', '0' if fav_raw is None else fav_raw)

        for field, query in (
                ('tags', '//div[@class="entry-meta"]/p/a/text()'),
                ('content', '//div[@class="entry"]'),
        ):
            loader.add_xpath(field, query)

        yield loader.load_item()
コード例 #10
0
    def parse_detail(self, response):
        """Fill a ``JobboleArticleItem`` with title, time, vote, url and image.

        Args:
            response: Detail-page response; ``response.meta`` must contain
                ``front_img_url``.

        Yields:
            The populated item.

        Raises:
            IndexError: If the title, time or vote node is missing.
            KeyError: If ``front_img_url`` is not present in the meta dict.
        """
        query = response.xpath

        headline = query('//*[@class="entry-header"]/h1/text()')[0].extract()
        # Remove the separator dot and surrounding whitespace from the date.
        raw_time = query(
            '//*[@class="entry-meta-hide-on-mobile"]/text()').extract()[0]
        posted = raw_time.replace('·', '').strip()
        votes = query(
            '//*[@class="post-adds"]/span[1]/h10/text()').extract()[0]
        cover = response.meta["front_img_url"]

        jobboleItem = JobboleArticleItem()
        for key, value in (
                ("title", headline),
                ("time", posted),
                ("vote", votes),
                ("url", response.url),
                ("img_url", cover),
        ):
            jobboleItem[key] = value

        yield jobboleItem
コード例 #11
0
    def parse_detail(self, response):
        """Extract the article fields with CSS selectors and yield the item.

        Favourite and comment counts are parsed to ``int`` from the first
        run of digits in their text, defaulting to ``0`` when there is none.

        Args:
            response: Detail-page response. ``response.meta`` may carry
                ``front_img_url`` (the article cover image).

        Yields:
            The populated ``JobboleArticleItem``.

        Raises:
            IndexError: If the creation-date or praise node is missing.
        """

        def first_int(text):
            # First run of digits in ``text`` as an int, or 0 if none.
            # The previous pattern ".*(\d+).*" was greedy, so the leading
            # ".*" swallowed all but the last digit ("12" was parsed as 2).
            found = re.search(r"\d+", text)
            return int(found.group()) if found else 0

        article_item = JobboleArticleItem()

        # NOTE(review): ``title`` and ``content`` are stored as the full list
        # returned by ``extract()``, unlike the other scalar fields. Left
        # unchanged because downstream consumers are not visible - confirm
        # whether a single string is expected.
        title = response.css(".entry-header h1::text").extract()
        create_date = response.css(
            "p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace(
                "·", "").strip()
        praise_nums = response.css("span.vote-post-up h10::text").extract()[0]
        # Cover image URL passed along in the request meta ("" when absent).
        front_img_url = response.meta.get("front_img_url", "")

        fav_nums = first_int(
            response.css(".bookmark-btn::text").extract_first(""))

        # "::text" added: without it the whole <span> markup was extracted
        # and digits inside attribute values could be taken for the count.
        comment_nums = first_int(
            response.css(
                "a[href='#article-comment'] span::text").extract_first(""))

        content = response.css(".entry").extract()

        # The comment-count link sits among the tag links; leave it out.
        tag_list = response.css('.entry-meta-hide-on-mobile a::text').extract()
        tags = ",".join(
            element for element in tag_list
            if not element.strip().endswith("评论"))

        article_item["title"] = title
        article_item["url"] = response.url
        article_item["url_object_id"] = get_md5(response.url)
        article_item["create_date"] = create_date
        article_item["front_img_url"] = [front_img_url]
        article_item["praise_nums"] = praise_nums
        article_item["fav_nums"] = fav_nums
        article_item["comment_nums"] = comment_nums
        article_item["tags"] = tags
        article_item["content"] = content

        yield article_item
コード例 #12
0
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` through ``TakeFirstItemLoader``.

        Args:
            response: Detail-page response. ``response.meta`` may carry
                ``front_image_url`` (the article cover image).

        Yields:
            The item returned by the loader's ``load_item()``.
        """
        loader = TakeFirstItemLoader(item=JobboleArticleItem(),
                                     response=response)
        page_url = response.url

        loader.add_value("table_name", "jobbole_article")
        loader.add_css("title", ".entry-header h1::text")

        # Values that are known without querying the page body.
        for field, value in (
                ("url", page_url),
                ("url_object_id", get_md5(page_url)),
                ("front_image_url", [response.meta.get("front_image_url", "")]),
        ):
            loader.add_value(field, value)

        # (field, CSS selector) pairs, added in this order.
        for field, selector in (
                ("create_date", ".entry-meta-hide-on-mobile::text"),
                ("praise_nums", "span.vote-post-up h10::text"),
                ("fav_nums", "span.bookmark-btn::text"),
                ("comment_nums", "a[href='#article-comment'] span::text"),
                ("tag", ".entry-meta-hide-on-mobile a::text"),
                ("content", ".entry"),
        ):
            loader.add_css(field, selector)

        yield loader.load_item()
コード例 #13
0
ファイル: jobbole.py プロジェクト: zybin2756/Article_Spider
 def parse_article(self, response):
     """Parse an article page into an item via ``mArticleItemLoader``.

     Args:
         response: Article-page response. ``response.meta`` may carry
             ``img_url``.

     Yields:
         The item returned by the loader's ``load_item()``.
     """
     cover = response.meta.get("img_url", "")
     loader = mArticleItemLoader(item=JobboleArticleItem(),
                                 response=response)

     # (field, CSS selector) pairs, added in this order.
     for field, selector in (
             ("title", ".entry-header h1::text"),
             ("create_time", ".entry-meta-hide-on-mobile::text"),
             ("mark_nums", ".bookmark-btn::text"),
             ("comment_nums",
              ".btn-bluet-bigger.href-style.hide-on-480::text"),
             ("vote_nums", ".vote-post-up h10::text"),
             ("content", ".entry"),
     ):
         loader.add_css(field, selector)

     page_url = response.url
     # Crawl date formatted as year/month/day.
     crawled_on = datetime.now().strftime("%Y/%m/%d")
     for field, value in (
             ("crawl_time", crawled_on),
             ("url", page_url),
             ("img_url", [cover]),
             ("object_id", get_md5(page_url)),
     ):
         loader.add_value(field, value)

     yield loader.load_item()
コード例 #14
0
    def parse_detail(self, response):
        """Parse a news detail page and request its AJAX statistics.

        The numeric post id is taken from the first run of digits in
        ``response.url``. When the URL contains no digits nothing is yielded.
        Otherwise a partially filled ``JobboleArticleItem`` is attached to a
        follow-up request for ``/NewsAjax/GetAjaxNewsInfo`` whose callback is
        ``self.parse_num``.

        Args:
            response: Detail-page response. ``response.meta`` may carry
                ``front_img_url`` (the article cover image).

        Yields:
            A ``Request`` carrying the item and the page URL in its meta.

        Raises:
            IndexError: If the page has no ``div#news_body`` element.
        """
        # Raw string: "\d" in a normal literal is an invalid escape sequence
        # and triggers a warning on current Python versions.
        match_re = re.match(r".*?(\d+)", response.url)
        if not match_re:
            return

        article_item = JobboleArticleItem()

        title = response.xpath(
            '//div[@id="news_title"]/a/text()').extract_first("")
        create_time = response.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()'
        ).extract_first("")
        content = response.xpath('//div[@id="news_body"]').extract()[0]
        tags = ",".join(
            response.xpath(
                '//div[@id="news_more_info"]/div[@class="news_tags"]/a/text()'
            ).extract())

        # First run of digits in the URL is used as the content id.
        post_id = match_re.group(1)
        info_url = parse.urljoin(
            response.url,
            "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id))

        article_item["title"] = title
        article_item["create_date"] = create_time
        article_item["content"] = content
        article_item["tags"] = tags
        article_item["url"] = response.url
        article_item["url_object_id"] = common.get_md5(response.url)
        # NOTE(review): when the meta key is absent this stores [""] - confirm
        # that downstream consumers tolerate an empty URL.
        article_item["front_img_url"] = [
            response.meta.get("front_img_url", "")
        ]

        yield Request(url=info_url,
                      meta={
                          "article_item": article_item,
                          "url": response.url
                      },
                      callback=self.parse_num)
コード例 #15
0
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` from an article page via ItemLoader.

        Raw CSS selector results are handed to ``ArticleItemLoader``
        unchanged; any cleaning presumably happens in the loader/item
        processors, which are defined outside this method.

        Args:
            response: Detail-page response. ``response.meta`` may carry
                ``front_image_url`` (the article cover image).

        Yields:
            The item returned by ``item_loader.load_item()``, which is then
            passed on to the item pipelines.
        """
        # Cover image URL passed along in the request meta ("" when absent).
        front_image_url = response.meta.get("front_image_url", "")

        # The item is loaded through ItemLoader; per the original author's
        # note, this is the approach to use for parsing values from now on.
        item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                        response=response)

        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        # Wrapped in a list, as the original code does for the image field.
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", "span.bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        # Hand the item over to the pipeline classes.
        yield article_item
コード例 #16
0
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` from an article detail page.

        Selector-based fields are read from ``response`` with CSS selectors
        through ``ArticleItemLoader``; the URL, ``get_md5(response.url)`` and
        the cover image URL are added as plain values.

        Args:
            response: Detail-page response; ``response.meta`` may carry
                ``front_image_url``.

        Yields:
            The item returned by the loader's ``load_item()``.
        """
        # Cover image URL from the request meta; ``get`` with a default does
        # not raise when the key is missing.
        cover_url = response.meta.get("front_image_url", "")

        loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        loader.add_css("title", ".entry-header h1::text")
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value("url", response.url)
        loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        loader.add_value("front_image_url", [cover_url])

        # The remaining fields are all plain CSS extractions.
        trailing_css_fields = (
            ("praise_nums", ".vote-post-up h10::text"),
            ("comment_nums", "a[href='#article-comment'] span::text"),
            ("fav_nums", ".bookmark-btn::text"),
            ("tags", "p.entry-meta-hide-on-mobile a::text"),
            ("content", "div.entry"),
        )
        for field_name, selector in trailing_css_fields:
            loader.add_css(field_name, selector)

        yield loader.load_item()
コード例 #17
0
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` from an article detail page.

        Fields are collected through ``ArticleItemLoader``: page data is read
        with XPath expressions, while the URL, ``get_md5(response.url)`` and
        the cover image URL are added as plain values.

        Args:
            response: Detail-page response; ``response.meta`` may carry
                ``front_image_url``.

        Yields:
            The item returned by the loader's ``load_item()``.
        """
        # Cover image URL carried in the request meta (empty when absent).
        cover_url = response.meta.get("front_image_url", "")

        loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_xpath(
            "create_date", "//p[@class='entry-meta-hide-on-mobile']/text()"
        )
        loader.add_value("front_image_url", [cover_url])

        # The remaining fields are all plain XPath extractions.
        # NOTE(review): ``content`` is not populated by this method -- confirm
        # that leaving it out is intended.
        trailing_xpath_fields = (
            ("praise_nums", "//span[contains(@class,'vote-post-up')]/h10/text()"),
            ("comment_nums", "//a[@href='#article-comment']/span/text()"),
            ("fav_nums", "//span[contains(@class,'bookmark-btn')]/text()"),
            ("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()"),
        )
        for field_name, expression in trailing_xpath_fields:
            loader.add_xpath(field_name, expression)

        yield loader.load_item()
コード例 #18
0
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` from an article detail page.

        Uses a plain ``ItemLoader``: the URL and cover image are added as
        values, most page fields are read with XPath and the tags with a CSS
        selector.

        Args:
            response: Detail-page response; ``response.meta`` may carry
                ``front_img_url``.

        Yields:
            The item returned by the loader's ``load_item()``.
        """
        loader = ItemLoader(item=JobboleArticleItem(), response=response)

        # Values that do not come from the page markup.
        page_url = response.url
        loader.add_value('url', page_url)
        # NOTE(review): the raw URL is stored as ``url_object_id`` here;
        # presumably the item's processors hash it -- verify against the
        # item definition.
        loader.add_value('url_object_id', page_url)
        loader.add_value('front_img_url', response.meta.get('front_img_url', ''))

        # Fields extracted with XPath, in the order they are added.
        xpath_fields = (
            ('title', '//div[@class="entry-header"]/h1/text()'),
            ('create_date', "//p[@class='entry-meta-hide-on-mobile']/text()"),
            ('thumb_up', "//span[contains(@class, 'vote-post-up')]/h10/text()"),
            ('save_num', "//span[contains(@class, 'bookmark-btn')]/text()"),
            ('comment_num', "//a[@href = '#article-comment']/span/text()"),
            ('content', "//div[@class = 'entry']"),
        )
        for field_name, expression in xpath_fields:
            loader.add_xpath(field_name, expression)

        loader.add_css('tag', ".entry-meta-hide-on-mobile > a ::text")

        yield loader.load_item()
コード例 #19
0
ファイル: jobbole.py プロジェクト: zhaixuyan0523/Scrapy
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` from an article detail page.

        Page fields are read with CSS selectors through ``ArticleItemLoader``;
        ``get_md5(response.url)``, the URL and the cover image URL are added
        as plain values afterwards.

        Args:
            response: Detail-page response; ``response.meta`` may carry
                ``front_image_url``.

        Yields:
            The item returned by the loader's ``load_item()``.
        """
        # Cover image URL carried in the request meta (empty when absent).
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)

        # Fields extracted from the page markup.
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
        item_loader.add_css('praise_nums', '.vote-post-up h10::text')
        item_loader.add_css('fav_nums', '.bookmark-btn::text ')
        item_loader.add_css('comment_nums', "a[href='#article-comment'] span::text")
        item_loader.add_css('content', 'div.entry')
        item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')

        # Values that do not come from the page markup.
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_value('url', response.url)
        item_loader.add_value('front_image_url', [front_image_url])

        # The item is built by the loader only; no separate item instance is
        # created up front.
        yield item_loader.load_item()
コード例 #20
0
 def parse_detail(self, response):
     """Load a ``JobboleArticleItem`` from an article detail page.

     Page fields are read with CSS selectors through ``ArticleItemLoader``;
     the cover image URL, the URL and ``get_md5(response.url)`` are added as
     plain values.

     Args:
         response: Detail-page response; ``response.meta`` may carry
             ``front_image_url``.

     Yields:
         The item returned by the loader's ``load_item()``.
     """
     item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                     response=response)
     # Cover image URL carried in the request meta (empty when absent).
     front_image_url = response.meta.get("front_image_url", "")
     item_loader.add_css("title", "div.entry-header h1::text")
     item_loader.add_value("front_image_url", [front_image_url])
     item_loader.add_value("url", response.url)
     item_loader.add_value("url_object_id", get_md5(response.url))
     item_loader.add_css("praise_number", "span.vote-post-up h10::text")
     # The class is ``entry-meta-hide-on-mobile`` (same element the ``tags``
     # selector below uses); the former ``hider`` spelling never matched, so
     # ``create_time`` was never collected.
     item_loader.add_css("create_time",
                         "p.entry-meta-hide-on-mobile::text")
     item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
     item_loader.add_css("content", "div.entry")
     item_loader.add_css("fav_nums", "span.bookmark-btn::text")
     item_loader.add_css("comment_nums",
                         "a[href='#article-comment'] span::text")
     article_item = item_loader.load_item()
     yield article_item
コード例 #21
0
ファイル: jobbole.py プロジェクト: china-bear/ArticleSpider
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` from an article detail page.

        Page fields are read with CSS selectors through
        ``ArticlespiderItemLoader``; the URL, ``common.md5(response.url)`` and
        the cover image URL are added as plain values afterwards.

        Args:
            response: Detail-page response; ``response.meta`` may carry
                ``front_image_url``.

        Yields:
            The item returned by the loader's ``load_item()``.
        """
        # Cover image URL carried in the request meta (empty when absent).
        cover_url = response.meta.get("front_image_url", "")

        loader = ArticlespiderItemLoader(
            item=JobboleArticleItem(),
            response=response,
        )

        # Fields extracted from the page markup, in the order they are added.
        css_fields = (
            ("title", ".entry-header h1::text"),
            ("create_date", "p.entry-meta-hide-on-mobile::text"),
            ("praise_nums", "span.vote-post-up h10::text"),
            ("comment_nums", "a[href='#article-comment'] span::text"),
            ("fav_nums", "span.bookmark-btn::text"),
            ("tags", "p.entry-meta-hide-on-mobile a::text"),
            ("content", "div.entry"),
        )
        for field_name, selector in css_fields:
            loader.add_css(field_name, selector)

        # Values that do not come from the page markup.
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", common.md5(response.url))
        loader.add_value("front_image_url", [cover_url])

        yield loader.load_item()
コード例 #22
0
ファイル: jobbole.py プロジェクト: 19951025zzr/jobbole-spider
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` from an article detail page.

        Page fields are read with CSS selectors through ``ArticleItemLoader``;
        ``get_md5(response.url)``, the URL and the cover image URL are added
        as plain values afterwards.

        Args:
            response: Detail-page response; ``response.meta`` may carry
                ``front_image_url``.

        Yields:
            The item returned by the loader's ``load_item()``.
        """
        # Cover image URL carried in the request meta (empty when absent).
        front_image_url = response.meta.get('front_image_url', '')
        item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                        response=response)

        # Fields extracted from the page markup.
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
        item_loader.add_css('praise_nums', '.vote-post-up h10::text')
        item_loader.add_css('fav_nums', 'span.bookmark-btn::text')
        item_loader.add_css('comment_nums',
                            "a[href='#article-comment'] span::text")
        item_loader.add_css('content', "div.entry")
        item_loader.add_css('tags', "p.entry-meta-hide-on-mobile a::text")

        # Values that do not come from the page markup.
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_value('url', response.url)
        item_loader.add_value('front_image_url', [front_image_url])

        # The item is built by the loader only; no separate item instance is
        # created up front.
        yield item_loader.load_item()
コード例 #23
0
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` from an article detail page.

        Fields are collected through ``AticleItmeLoad``: the URL, cover image
        and ``get_md5(response.url)`` are added as plain values, page data is
        read with XPath, and the publication time with a CSS selector.

        Args:
            response: Detail-page response; ``response.meta`` may carry
                ``front_image_url`` (``None`` is used when it is absent).

        Yields:
            The item returned by the loader's ``load_item()``.
        """
        # Cover image URL passed in through the request meta.
        front_image_url = response.meta.get("front_image_url")

        item_loader = AticleItmeLoad(item=JobboleArticleItem(),
                                     response=response)

        # Values that do not come from the page markup.
        item_loader.add_value("url", response.url)
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_value("front_image_url_db", front_image_url)
        item_loader.add_value("url_object_id", get_md5(response.url))

        # Fields extracted from the page markup.
        item_loader.add_xpath("re_title",
                              "//div[@class='entry-header']/h1/text()")
        item_loader.add_xpath("content", "//div[@class='entry']")
        # ``praise_num`` is added exactly once; adding the same XPath a second
        # time made the loader collect the value twice.
        item_loader.add_xpath(
            "praise_num", "//span[contains(@class,'vote-post-up')]/h10/text()")
        item_loader.add_xpath(
            "fav_num", "//span[contains(@class,'bookmark-btn')]/text()")
        item_loader.add_xpath("comment_num",
                              "//a[@href='#article-comment']/span/text()")
        item_loader.add_xpath(
            "tag", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
        item_loader.add_css("re_time", "p.entry-meta-hide-on-mobile::text")

        article_item = item_loader.load_item()
        yield article_item
コード例 #24
0
    def parse_detail(self, response):
        """Extract one article's fields from its detail page via CSS selectors.

        Fills a ``JobboleArticleItem`` with the title, creation date, cover
        image URL, praise/favourite/comment counts, tags and the extracted
        ``div.entry`` markup, then yields it.

        Args:
            response: Detail-page response; ``response.meta`` may carry
                ``front_image_url``.

        Yields:
            The populated ``JobboleArticleItem``.

        Raises:
            IndexError: If a selector indexed with ``[0]`` matches nothing.
        """

        def first_int(text):
            # First run of digits anywhere in ``text`` as an int, or 0 when
            # there is none.  ``re.search`` also finds digits that follow a
            # newline, which an anchored ``re.match`` with ``.`` cannot skip.
            found = re.search(r"\d+", text)
            return int(found.group()) if found else 0

        # Cover image URL carried in the request meta (empty when absent).
        front_image_url = response.meta.get("front_image_url", "")

        title = response.css(".entry-header h1::text").extract()[0]

        raw_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0]
        date_text = raw_date.strip().replace("·", "").strip()
        try:
            create_date = datetime.datetime.strptime(date_text, "%Y/%m/%d").date()
        except ValueError:
            # Unparseable date text: fall back to today's date.
            create_date = datetime.datetime.now().date()

        # Stored as extracted text, exactly as before (not converted to int).
        praise_nums = response.css("span.vote-post-up h10::text").extract()[0]
        fav_nums = first_int(
            response.css("span.bookmark-btn::text").extract()[0])
        comment_nums = first_int(
            response.css("a[href='#article-comment'] span::text").extract()[0])

        content = response.css("div.entry").extract()[0]

        # The same <p> also holds the "N 评论" comment link; drop it from tags.
        tag_list = response.css('p.entry-meta-hide-on-mobile a::text').extract()
        tags = ",".join(
            element for element in tag_list
            if not element.strip().endswith(u"评论")
        )

        article_item = JobboleArticleItem()
        article_item['url_object_id'] = get_md5(response.url)
        article_item["title"] = title
        article_item["url"] = response.url
        article_item["create_date"] = create_date
        article_item["front_image_url"] = [front_image_url]
        article_item["praise_nums"] = praise_nums
        article_item["comment_nums"] = comment_nums
        article_item["fav_nums"] = fav_nums
        article_item["tags"] = tags
        article_item["content"] = content

        yield article_item
コード例 #25
0
    def parse_detail(self, response):
        """Extract the fields of one article page and yield them as an item.

        Reads the title, publish date, vote/bookmark/comment counters, tags
        and plain-text body from ``response`` and stores them, together with
        the page URL and its MD5-based id, in a ``JobboleArticleItem``.

        Missing page elements do not abort the item: absent counters fall
        back to 0 and an absent or unparsable date falls back to today's
        date.

        Args:
            response: The downloaded article page. ``response.meta`` may
                carry a ``"coverImg"`` entry with the cover image URL
                (presumably set by the callback that scheduled this
                request -- confirm against the caller).

        Yields:
            The populated ``JobboleArticleItem``.
        """
        # Notes on choosing selectors:
        # - DOM element ids are globally unique.
        # - Scrapy receives the HTML as served, before any JavaScript has
        #   run (the same thing "view source" shows in a browser), whereas
        #   an XPath copied from the browser's developer tools is based on
        #   the DOM after JavaScript ran.
        # - Positional paths copied from the browser (/div[1]/div[2]/...)
        #   are therefore sometimes unreliable; paths anchored on an id or
        #   a meaningful class are more dependable.
        # - response.xpath(...) returns a SelectorList; extract() on it
        #   returns a list of strings, extract_first() the first string or
        #   the given default.

        def first_int(text):
            """Return the first run of digits in *text* as an int, else 0."""
            found = re.search(r"\d+", text or "")
            return int(found.group()) if found else 0

        cover_img = response.meta.get("coverImg", "")
        title = response.css(".entry-header h1::text").extract_first()

        # The first text node looks like "\r\n\r\n   2018/10/15 ·  ".
        raw_date = response.xpath(
            '//*[@class="entry-meta"]/p[1]/text()').extract_first(default="")
        raw_date = raw_date.replace('·', '').strip()
        try:
            create_date = datetime.datetime.strptime(
                raw_date, "%Y/%m/%d").date()
        except ValueError:
            # Missing or unexpected date format: fall back to today.
            create_date = datetime.datetime.now().date()

        # The vote count lives in an <h10> inside the "vote-post-up" span.
        thumb_up = first_int(
            response.xpath(
                "//span[contains(@class, 'vote-post-up')]/h10/text()"
            ).extract_first())

        # Bookmark and comment labels carry the number only when non-zero.
        fav_num = first_int(
            response.xpath(
                "//span[contains(@class, 'bookmark-btn')]/text()"
            ).extract_first())
        comment_num = first_int(
            response.xpath(
                "//a[@href='#article-comment']/span/text()"
            ).extract_first())

        # The tag links also contain the "N 评论" (comments) link; drop it.
        tag_texts = response.xpath(
            "//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
        tags = ",".join(
            text for text in tag_texts
            if not text.strip().endswith("评论")
        )

        # Join every text node of the body and collapse all whitespace
        # (tabs and newlines included) to single spaces.
        content = " ".join(
            " ".join(response.css("div.entry *::text").extract()).split())

        item = JobboleArticleItem()
        item["title"] = title
        item["date"] = create_date
        item["url"] = response.url
        item["urlObjId"] = get_md5(response.url)
        # Wrapped in a list because the pipelines process this field as a
        # list (per the original author's note).
        item["coverImgUrl"] = [cover_img]
        item["thumbUp"] = thumb_up
        item["favNum"] = fav_num
        item["commentNum"] = comment_num
        item["tags"] = tags
        item["content"] = content

        # Yielding hands the item to the item pipelines when they are
        # enabled in the project settings.
        yield item