Exemple #1
0
    def parse_detail(self, response):
        """Extract article fields from a detail page via ItemLoader and yield the item.

        Every field loaded through ItemLoader is stored as a list by default;
        the item/loader output processors are expected to unwrap them.
        """
        # Build the item through an ItemLoader so the extraction rules stay
        # declarative and easy to maintain.
        # (Removed: a dead `ArticlespiderItem()` instantiation that was
        # immediately overwritten by `load_item()` below.)
        item_loader = ItemLoader(item=ArticlespiderItem(), response=response)
        item_loader.add_xpath('title', '//a[@id="cb_post_title_url"]/text()')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        # Must be a list: the image pipeline otherwise raises
        # "ValueError: Missing scheme in request url".
        item_loader.add_value('front_image_url',
                              [response.meta.get('front_image_url', '')])

        articleItem = item_loader.load_item()
        yield articleItem
Exemple #2
0
    def parse_details(self, response):
        """Parse a news detail page with CSS selectors and yield the item.

        The cover image URL is passed in through ``response.meta`` by the
        listing-page callback.
        """
        news = ArticlespiderItem()

        img_url = response.meta.get("img_url", "")
        title = response.css(".entry-header h1::text").extract_first("")
        # Renamed from `datetime`: that name shadowed the stdlib module.
        publish_time = response.css(
            ".entry-meta-hide-on-mobile::text").extract_first(
                "").replace(" ·", "").strip()
        praise_num_str = response.css("#114676votetotal::text").extract_first("")
        praise_num = int(praise_num_str) if praise_num_str else 0
        content = response.css(".entry p::text").extract()[0]

        news['title'] = title
        news['datetime'] = publish_time
        news['praise_num'] = praise_num
        news['content'] = content
        # Image pipeline expects a list of URLs.
        news['img_url'] = [img_url]
        # NOTE(review): the object id is derived from the image URL here,
        # while sibling spiders hash response.url — confirm which is intended.
        news['url_object_id'] = get_md5(img_url)

        # (Removed: an unreachable-to-succeed ItemLoader stub that passed the
        # item CLASS instead of an instance and called add_css()/add_xpath()/
        # add_value() with no arguments — a guaranteed TypeError at runtime.)
        yield news
Exemple #3
0
    def detail(self, response):
        """Extract article fields with an ItemLoader and return the item.

        Raw selector lists are normalised by the input/output processors
        declared on ``ArticleItemLoder`` / ``ArticlespiderItem``.
        """
        # (Removed: ~40 lines of commented-out manual-extraction code that
        # duplicated the loader rules below.)
        item_loader = ArticleItemLoder(item=ArticlespiderItem(),
                                       response=response)
        item_loader.add_css('title', 'div.entry-header h1::text')
        item_loader.add_css('pubtime', '.entry-meta-hide-on-mobile::text')
        item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')
        item_loader.add_css('praise_num', '.post-adds span h10::text')
        item_loader.add_css('fav_num', '.bookmark-btn::text')
        item_loader.add_css('comment_num',
                            'a[href="#article-comment"] span::text')
        item_loader.add_css('content', 'div.entry p::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        # Image pipeline expects a list of URLs.
        item_loader.add_value('image_urls',
                              [response.meta.get("image_urls", '')])

        # The returned item has been run through the loader's processors.
        article_item = item_loader.load_item()
        return article_item
Exemple #4
0
    def parse_detail(self, response):
        """Parse a news detail page, then request its stats via AJAX.

        Praise/view/comment counters live behind a separate AJAX endpoint,
        so the partially-filled item is forwarded in ``meta`` to
        ``parse_nums``.
        """
        # Raw string avoids the invalid-escape warning for \d.
        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            article_item = ArticlespiderItem()
            title = response.css("#news_title a::text").extract_first("")
            create_date = response.css("#news_info .time::text").extract_first(
                "")
            # Keep the string from the first digit onward (drops any
            # non-numeric prefix text).
            match_create_date = re.match(r'.*?(\d+.*)', create_date)
            if match_create_date:
                create_date = match_create_date.group(1)
            content = response.css("#news_content").extract()
            tag_list = response.css(".news_tags a::text").extract()
            tags = ','.join(tag_list)
            article_item['title'] = title
            article_item['create_date'] = create_date
            article_item['content'] = content
            article_item['tags'] = tags

            # Image pipeline needs a list; fall back to an empty list when
            # the listing page supplied no cover image.
            if response.meta.get('front_image_url', ''):
                article_item['front_image_url'] = [
                    response.meta.get('front_image_url', '')
                ]
            else:
                article_item['front_image_url'] = []
            article_item['url'] = response.url

            post_id = match_re.group(1)
            # Fetch the counters asynchronously with a scrapy Request rather
            # than blocking the reactor with the synchronous requests library.
            yield Request(url=parse.urljoin(
                response.url,
                "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                          meta={'article_item': article_item},
                          callback=self.parse_nums)
Exemple #5
0
 def parse_detail(self, response):
     """Load article fields through ArticleItemLoader and yield the item.

     (Removed: a dead `ArticlespiderItem()` instantiation that was
     immediately overwritten by `load_item()`.)
     """
     front_image_url = response.meta.get("front_image_url", "")
     item_loader = ArticleItemLoader(item=ArticlespiderItem(),
                                     response=response)
     item_loader.add_css("title", ".entry-header h1::text")
     item_loader.add_value("url", response.url)
     item_loader.add_value("url_object_id", get_md5.get_md5(response.url))
     item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
     # Image pipeline expects a list of URLs.
     item_loader.add_value("front_image_url", [front_image_url])
     # NOTE(review): field is spelled "parise_nums" in the item definition,
     # so the misspelling must be kept here.
     item_loader.add_css("parise_nums", ".vote-post-up h10::text")
     item_loader.add_css("comment_nums",
                         "a[href='#article-comment'] span::text")
     item_loader.add_css("fav_nums", ".bookmark-btn::text")
     item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
     item_loader.add_css("content", "div.entry")
     article_item = item_loader.load_item()
     yield article_item
Exemple #6
0
    def parse_detail(self, response):
        """Parse a news page and chain a request for its AJAX counters.

        Praise/view/comment numbers come from a separate endpoint; the item
        built here travels in ``meta`` to ``parse_nums``.
        """
        # Raw string avoids the invalid-escape warning for \d.
        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            post_id = match_re.group(1)
            # ItemLoader keeps the extraction declarative and maintainable.
            # (Removed: ~30 lines of commented-out manual-extraction code.)
            item_loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
            item_loader.add_css("title", "#news_title a::text")
            item_loader.add_css("content", "#news_content")
            item_loader.add_css("tags", ".news_tags a::text")
            item_loader.add_css("create_time", "#news_info .time::text")
            item_loader.add_value("url", response.url)
            if response.meta.get('front_image_url', []):
                item_loader.add_value('front_image_url', response.meta.get('front_image_url', []))

            article_item = item_loader.load_item()
            # BUG FIX: the original re-checked the MISSPELLED meta key
            # "front_image_ur;", which is never present, so the cover URL the
            # loader just set was always clobbered with []. Now only supply
            # the empty-list default when the loader produced no value.
            if 'front_image_url' not in article_item:
                article_item['front_image_url'] = []
            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                          meta={"article_item": article_item}, callback=self.parse_nums)
Exemple #7
0
    def parse_detail(self, response):
        """Extract all article fields with XPath and yield a populated item.

        Counts that may be absent from the page degrade to 0 instead of
        raising.
        """
        front_img_url = response.meta.get('img_url', '')
        title = response.xpath(
            '//div[@class="entry-header"]/h1/text()').extract_first()
        create_date = response.xpath(
            '//p[@class="entry-meta-hide-on-mobile"]/text()').extract(
            )[0].strip().replace('·', '').strip()
        # Guard against a missing <h10> element: int(None) would raise.
        praise_str = response.xpath('//h10/text()').extract_first()
        praise_num = int(praise_str) if praise_str else 0
        # Favourite-count text, e.g. "39收藏" — pull out the number.
        fav_str = response.xpath(
            '//div[@class="post-adds"]//span[contains(@class,"bookmark-btn")]/text()'
        ).extract_first().strip()
        fav_regex_str = r'.*?(\d+).*'
        fav_match_res = re.match(fav_regex_str, fav_str)
        if fav_match_res:
            fav_num = fav_match_res.group(1)
        else:
            fav_num = 0
        # Comment-count text, e.g. "890评论" — pull out the number.
        comment_str = response.xpath(
            '//a[@href="#article-comment"]/span[contains(@class,"href-style")]/text()'
        ).extract_first().strip()
        comment_regex_str = r'.*?(\d+).*'
        comment_match_res = re.match(comment_regex_str, comment_str)
        if comment_match_res:
            comment_num = comment_match_res.group(1)
        else:
            comment_num = 0
        # Tag links, e.g. ['IT', 'dev'] — joined into one string.
        tags_list = response.xpath(
            '//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        tags = ','.join(tags_list)
        content = '\n'.join(
            response.xpath('//div[@class="entry"]/p/text()').extract())

        try:
            create_date = datetime.datetime.strptime(create_date,
                                                     '%Y/%m/%d').date()
        except Exception:
            # BUG FIX: the original fallback called datetime.datetime.date()
            # with no instance, which itself raises TypeError. Use today's
            # date instead.
            create_date = datetime.datetime.now().date()
        article_item = ArticlespiderItem()
        article_item['title'] = title
        article_item['create_date'] = create_date
        article_item['url'] = response.url
        article_item['url_object_id'] = get_md5(response.url)
        # Image pipeline expects lists for these two fields.
        article_item['front_img_url'] = [front_img_url]
        article_item['front_img_path'] = [IMAGES_STORE]
        article_item['praise_num'] = praise_num
        article_item['fav_num'] = fav_num
        article_item['comment_num'] = comment_num
        article_item['tags'] = tags
        article_item['content'] = content

        yield article_item
Exemple #8
0
    def parse_detail(self, response):
        """Pre-extract guarded scalar fields, then load them into the item.

        Values that may be missing from the page fall back to defaults
        instead of raising.
        """
        # (Removed: an unused local `title` — the loader re-extracts the
        # title itself via add_xpath below.)
        # Publish time, e.g. "2017/07/17 ·" -> "2017/07/17".
        crttime_content = response.xpath('//div[@class="entry-meta"]/p/text()').extract()
        if len(crttime_content) == 0:
            create_time = 'no'
        else:
            create_time = crttime_content[0].replace('·', '').strip()
        # Article category: first tag link of the meta line.
        article_kind_content = response.xpath('//div[@class="entry-meta"]/p/a/text()').extract()
        if len(article_kind_content) == 0:
            article_kind = 0
        else:
            article_kind = article_kind_content[0]
        # Praise count.
        praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0]
        # Favourite count text, e.g. "39收藏".
        fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0]
        # BUG FIX: the original pattern ".*(\d+).*" is greedy, so group(1)
        # captured only the LAST digit (e.g. "39" -> "9"). The lazy ".*?"
        # captures the whole number; raw string avoids the \d escape warning.
        match_re = re.match(r".*?(\d+).*", fav_nums)
        if match_re:
            fav_nums = match_re.group(1)
        else:
            fav_nums = 0
        # Comment count text, e.g. "890评论".
        commant_nums = response.xpath("//a[@href='#article-comment']/span/text()").extract()[0]
        match_re = re.match(r".*?(\d+).*", commant_nums)
        if match_re:
            commant_nums = match_re.group(1)
        else:
            commant_nums = 0
        # Author display name.
        author_name_content = response.xpath("//div[@id='author-bio']//a/text()").extract()
        if len(author_name_content) == 0:
            author_name = 'no'
        else:
            author_name = author_name_content[0]

        item_loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
        item_loader.add_value('create_time', [create_time])
        item_loader.add_value('article_kind', [article_kind])
        item_loader.add_value('praise_nums', [praise_nums])
        item_loader.add_value('fav_nums', [fav_nums])
        item_loader.add_value('commant_nums', [commant_nums])
        item_loader.add_value('author_name', [author_name])
        article_item = item_loader.load_item()
        yield article_item
Exemple #9
0
    def parse_detail(self, response):
        """Extract article fields with CSS selectors and yield the item.

        The cover image URL is handed over via ``response.meta`` by the
        listing-page callback.
        """
        article_item = ArticlespiderItem()

        # Cover image URL forwarded from the listing page.
        front_image_url = response.meta.get("front_image_url", "")
        title = response.css(".entry-header h1::text").extract()[0]
        # Publish time, e.g. "2017/07/17 ·" -> "2017/07/17".
        create_date = response.css(
            "p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace(
                "·", "").strip()
        praise_nums = response.css(".vote-post-up h10::text").extract()[0]
        # Favourite count text, e.g. "39 收藏" -> 39. Raw strings avoid the
        # invalid-escape warning for \d.
        fav_nums = response.css(".bookmark-btn::text").extract()[0]
        match_re = re.match(r".*?(\d+).*", fav_nums)
        if match_re:
            fav_nums = int(match_re.group(1))
        else:
            fav_nums = 0

        # Comment count text, e.g. "890 评论" -> 890.
        comment_nums = response.css(
            "a[href='#article-comment'] span::text").extract()[0]
        match_re = re.match(r".*?(\d+).*", comment_nums)
        if match_re:
            comment_nums = int(match_re.group(1))
        else:
            comment_nums = 0

        content = response.css("div.entry").extract()[0]

        # Tag links, excluding the trailing comment-count pseudo-tag.
        tag_list = response.css(
            "p.entry-meta-hide-on-mobile a::text").extract()
        tag_list = [
            element for element in tag_list
            if not element.strip().endswith("评论")
        ]
        tags = ",".join(tag_list)

        article_item["title"] = title
        article_item["url"] = response.url
        # Image pipeline expects a list of URLs.
        article_item["front_image_url"] = [front_image_url]
        article_item["praise_nums"] = praise_nums
        article_item["comment_nums"] = comment_nums
        article_item["fav_nums"] = fav_nums
        article_item["tags"] = tags
        article_item["content"] = content

        yield article_item
    def _parse_data(self, response):
        """Load all article fields through an ItemLoader and yield the item.

        The loader's input/output processors convert the raw selector lists
        into the final field types before the item reaches the pipelines.
        """
        # (Removed: ~45 lines of commented-out manual-extraction code that
        # duplicated the loader rules below.)
        loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", security.get_md5(response.url))
        # Image pipeline expects a list of URLs.
        loader.add_value("main_image_url",
                         [response.meta.get("main_image_url", "")])
        loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
        loader.add_xpath("create_date",
                         '//*[@class="entry-meta-hide-on-mobile"]/text()')
        loader.add_xpath("stars",
                         '//span[contains(@class,"bookmark-btn")]/text()')
        loader.add_xpath("thumb_ups",
                         "//span[contains(@class,'vote-post-up')]/h10/text()")
        loader.add_xpath("comments",
                         "//a[@href='#article-comment']/span/text()")
        loader.add_xpath("content", "//div[@class='entry']")
        loader.add_xpath("tags",
                         "//p[@class='entry-meta-hide-on-mobile']/a/text()")
        # load_item() runs the declared processors to produce final values.
        article = loader.load_item()
        # Hand the item to the pipelines.
        yield article
Exemple #11
0
# -*- coding: utf-8 -*-