Example #1
0
    def parse_content(self, response):
        """Parse a Douban movie detail page and yield a populated DouBanItem.

        Robustness fixes: uses ``extract_first`` instead of ``extract()[0]``
        and guards positional access into ``info_list`` so pages missing the
        year span or one of the info fields no longer raise IndexError.
        """
        item_loader = ArticleItemLoader(item=DouBanItem(), response=response)
        item_loader.add_value("url", response.url)
        item_loader.add_xpath("title", "//div[@id='content']/h1/span[1]/text()")
        item_loader.add_xpath("director", "//div[@id='info']/span[1]/span[2]/a/text()")
        item_loader.add_css("score", "div.rating_self strong::text")
        item_loader.add_xpath("introduction", "//span[@property='v:summary']/text()")
        item_loader.add_xpath("front_image_url", "//*[@id='mainpic']/a/img/@src")

        # The #info block exposes area / language / nickname as bare text
        # nodes; keep only the ones matching the spider-level regex, in order.
        infos = response.xpath("//*[@id='info']/text()").extract()
        info_list = []
        for info in infos:
            match_re = re.match(self.info_rule, info.strip())
            if match_re:
                info_list.append(match_re.group(1))

        # extract_first avoids an IndexError when the year span is absent.
        time = response.xpath("//div[@id='content']/h1/span[2]/text()").extract_first()
        if time:
            match_re = re.match(self.time_rule, time)
            if match_re:
                item_loader.add_value("time", match_re.group(1))

        # Guard positional lookups: some pages omit one of these fields.
        if len(info_list) > 0:
            item_loader.add_value("area", info_list[0])
        if len(info_list) > 1:
            item_loader.add_value("language", info_list[1])
        if len(info_list) > 2:
            item_loader.add_value("nickname", info_list[2])

        douban_item = item_loader.load_item()
        yield douban_item
Example #2
0
    def parse_detail(self, response):
        """Parse a JobBole article page and yield a JobBoleArticleItem.

        Field extraction rules are declared on an ArticleItemLoader so that
        per-field cleaning lives in the item definition rather than in this
        callback. (Superseded hand-rolled xpath/css extraction removed.)
        """
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_xpath("title",
                              "//div[@class='entry-header']/h1/text()")
        item_loader.add_xpath(
            "create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
        item_loader.add_xpath("comment_nums",
                              "//a[@href='#article-comment']/span/text()")

        item_loader.add_css("content", "div.entry")

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        # Cover image URL is forwarded from the listing page via request meta.
        item_loader.add_value("front_image_url",
                              response.meta.get("front_image_url", ""))

        article_item = item_loader.load_item()

        yield article_item
Example #3
0
    def parse_detail(self, response):
        """Parse a news detail page; counter fields are fetched by parse_nums.

        The digg/view/comment counts live behind a JSON endpoint, so the
        partially-built item is forwarded to ``parse_nums`` via request meta
        instead of being yielded here.
        """
        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            post_id = match_re.group(1)

            # ItemLoader keeps the extraction rules declarative and easier
            # to maintain than ad-hoc css/xpath calls.
            item_loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
            item_loader.add_css("title", "#news_title a::text")
            item_loader.add_css("content", "#news_content")
            item_loader.add_css("tags", ".news_tags a::text")
            item_loader.add_css("create_time", "#news_info .time::text")
            item_loader.add_value("url", response.url)
            if response.meta.get('front_image_url', []):
                item_loader.add_value('front_image_url', response.meta.get('front_image_url', []))

            article_item = item_loader.load_item()
            # Normalise the cover image into a list (the images pipeline
            # expects one). Bug fix: the lookup previously used the typo'd
            # meta key "front_image_ur;" and therefore always produced [].
            if response.meta.get("front_image_url", ""):
                article_item['front_image_url'] = [response.meta.get('front_image_url', "")]
            else:
                article_item['front_image_url'] = []
            # Yield a Request (async) rather than calling the blocking
            # requests library; the leading "/" replaces any sub-path.
            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                          meta={"article_item": article_item}, callback=self.parse_nums)
Example #4
0
 def parse_detail(self, response):
     """Parse a TuiCool article page and yield a TuiCoolArticleItem.

     Bug fix: the loaded item was built but then discarded; it is now
     yielded so it actually reaches the item pipelines. An unused
     ``TuiCoolArticleItem()`` instance was also removed — the loader
     constructs the item itself.
     """
     front_image_url = response.meta.get("front_image_url", "")  # article cover image
     flagTrue = response.meta.get("flag", "")  # flag forwarded from the list page
     original = "http://www.tuicool.com/" + response.css(
         "span.from a::attr(href)").extract_first("")
     item_loader = ArticleItemLoader(item=TuiCoolArticleItem(),
                                     response=response)
     item_loader.add_css("title",
                         ".article_row_fluid div:nth-child(1) h1::text")
     item_loader.add_value("url", response.url)
     item_loader.add_value("url_object_id", get_md5(response.url))
     item_loader.add_css("create_date", "span.timestamp::text")
     item_loader.add_value("front_image_url", [front_image_url])
     item_loader.add_value("sites", original)
     item_loader.add_value("flag", flagTrue)
     item_loader.add_css("original", "div.source a::text")
     item_loader.add_css("tags", "span.new-label::text")
     item_loader.add_css("content", "div.article_body")
     article_item = item_loader.load_item()
     yield article_item
Example #5
0
    def parse_detail(self, response):
        """Parse a cnblogs post page and yield a CnblogsArticleItem.

        Bug fix: the receiver parameter was misspelled ``selfs``; renamed to
        the conventional ``self``. An unused ``CnblogsArticleItem()``
        instance was removed — the loader builds the item.
        """
        item_loader = ArticleItemLoader(item=CnblogsArticleItem(),
                                        response=response)
        item_loader.add_css("title", '#cb_post_title_url::text')
        item_loader.add_css('create_date', '#post-date::text')
        item_loader.add_css('author', '.postDesc a::text')
        item_loader.add_value('url', response.url)

        article_item = item_loader.load_item()

        yield article_item
Example #6
0
    def parse_detail(self, response):
        """Parse a JobBole article page and yield a JobBoleArticleItem.

        (Superseded hand-rolled extraction removed; the loader's processors
        handle per-field cleaning.)
        """
        # Cover image is forwarded from the listing page via request meta.
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_css("content", "div.entry")
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("url", response.url)
        item_loader.add_value("front_image_url", [front_image_url])

        article_item = item_loader.load_item()

        yield article_item
Example #7
0
  def parse_detail(self, response):
    """Parse a news detail page, then fetch its counters asynchronously.

    The digg/view/comment counts live behind a JSON endpoint, so the
    partially-filled loader is forwarded to ``parse_nums`` via request meta.
    """
    match_re = re.match(r".*?(\d+)", response.url)  # the URL must carry a numeric id
    if match_re:
      post_id = match_re.group(1)  # id used to build the counter-JSON URL

      # ArticleItemLoader is the project-defined loader from items.py.
      item_loader = ArticleItemLoader(item=JoBoleArticleItem(),
                                      response=response)
      item_loader.add_css("title", "#news_title a::text")
      item_loader.add_css("content", "#news_content")
      item_loader.add_css("tags", ".news_tags a::text")
      item_loader.add_css("time", "#news_info .time::text")
      item_loader.add_value("url", response.url)
      if response.meta.get("front_image_url", []):
        item_loader.add_value("front_image_url", response.meta.get("front_image_url", []))

      # Yield a Request instead of calling the blocking requests library;
      # the leading "/" in the path replaces any sub-path of response.url.
      yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                    meta={"article_item": item_loader, "url": response.url}, callback=self.parse_nums)
Example #8
0
    def parse_detail(self, response):
        """Parse a news detail page and hand off to parse_nums for counters.

        Praise/view/comment counts are served by a separate AJAX endpoint,
        so the loader is forwarded via request meta rather than an item
        being yielded here.
        """
        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            post_id = match_re.group(1)

            item_loader = ArticleItemLoader(item=ArticleSpiderItem(),
                                            response=response)
            item_loader.add_css('title', '#news_title a::text')
            item_loader.add_css('create_date', '#news_info .time::text')
            item_loader.add_css('content', '#news_content')
            item_loader.add_css('tags', '.news_tags a::text')
            item_loader.add_value('url', response.url)
            if response.meta.get('front_image_url', []):
                item_loader.add_value('front_image_url',
                                      response.meta.get('front_image_url', []))

            # Async fetch of the counter JSON instead of a blocking
            # requests.get call.
            yield Request(url=parse.urljoin(
                response.url,
                "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                          meta={
                              "article_item": item_loader,
                              "url": response.url
                          },
                          callback=self.parse_nums)
Example #9
0
    def parse_detail(self, response):
        """Parse a news detail page; counter fields are fetched by parse_nums.

        Also normalises protocol-relative cover-image URLs ("//...") by
        prefixing "https:".
        """
        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            post_id = match_re.group(1)

            item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                            response=response)
            item_loader.add_css("title", "#news_title a::text")
            item_loader.add_css("create_date", "#news_info .time::text")
            item_loader.add_css("content", "#news_content")
            item_loader.add_css("tags", ".news_tags a::text")
            item_loader.add_value("url", response.url)
            img_url = response.meta.get('front_image_url', "")
            if img_url:
                if img_url.startswith('http'):
                    item_loader.add_value("front_image_url", [img_url])
                else:
                    item_loader.add_value("front_image_url",
                                          ["https:" + img_url])
            # Counters come from a JSON endpoint; forward the loader so
            # parse_nums can finish building the item.
            yield Request(url=parse.urljoin(
                response.url,
                f'/NewsAjax/GetAjaxNewsInfo?contentId={post_id}'),
                          meta={
                              'article_item': item_loader,
                              'url': response.url
                          },
                          callback=self.parse_nums)
Example #10
0
 def parse_detail(self, response):
     """Build a SohuItem from an article detail page and yield it."""
     loader = ArticleItemLoader(item=SohuItem(), response=response)
     # Fields scraped from the page itself.
     loader.add_css("title", ".text-title h1::text")
     # Values carried over from the listing page via request meta, plus
     # request-derived bookkeeping fields.
     loader.add_value("url", response.url)
     loader.add_value("front_image_url", response.meta.get("front_image_url", ""))
     loader.add_value("url_object_id", get_md5(response.url))
     loader.add_value("article_type", response.meta.get("article_type", ""))
     loader.add_css("author_name", ".user-info h4 a::text")
     loader.add_css("publish_time", ".article-info span::text")
     loader.add_css("content", "article")
     loader.add_value("crawl_time", datetime.now())
     yield loader.load_item()
Example #11
0
    def parse_detail(self, response):
        """Parse a news detail page with an ItemLoader for maintainability.

        Counter fields come from a separate AJAX endpoint, so the loader is
        passed to ``parse_nums`` through request meta instead of an item
        being yielded here.
        """
        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            post_id = match_re.group(1)

            item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
            item_loader.add_css("title", "#news_title a::text")
            item_loader.add_css("content", "#news_content")
            item_loader.add_css("tags", ".news_tags a::text")
            item_loader.add_css("create_date", "#news_info .time::text")
            item_loader.add_value("url", response.url)
            if response.meta.get("front_image_url", []):
                item_loader.add_value("front_image_url", response.meta.get("front_image_url", []))

            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                          meta={"article_item": item_loader, "url": response.url}, callback=self.parse_nums)
Example #12
0
    def parse_detail(self, response):
        """Parse a cnblogs news detail page; counters are fetched by parse_nums."""
        match_re_id = re.match(r".*?(\d+)", response.url)
        if match_re_id:
            post_id = match_re_id.group(1)

            item_loader = ArticleItemLoader(item=CnblogsArticleItem(),
                                            response=response)
            item_loader.add_css("title", "#news_title a::text")
            item_loader.add_css("content", "#news_content")
            item_loader.add_css("tags", ".news_tags a::text")
            item_loader.add_css("create_time", "#news_info .time::text")
            item_loader.add_value("url", response.url)
            if response.meta.get("front_image_url", []):
                item_loader.add_value("front_image_url",
                                      response.meta.get("front_image_url", []))

            # The count fields come from a JSON endpoint; forward the loader
            # so parse_nums can finish building the item.
            yield Request(url=parse.urljoin(
                response.url,
                "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                          meta={
                              "article_item": item_loader,
                              "url": response.url
                          },
                          callback=self.parse_nums)
Example #13
0
    def parse_zl(self, response):
        """Parse a ZhiLian job-detail page and return the loaded ZhiLianItem.

        Renamed misleading locals: the loader was called ``LaGouArticleItem``
        (an item-class-like name for a ZhiLian loader) and the loaded item
        was ``ArticleItemLoder`` (typo, loader-like name for an item).
        Selectors and behaviour are unchanged.
        """
        item_loader = ArticleItemLoader(item=ZhiLianItem(), response=response)
        item_loader.add_css("job_name", '.fixed-inner-box h1::text')
        item_loader.add_xpath("salary", "//div[@class='terminalpage-left']/ul/li[1]/strong/text()")
        item_loader.add_xpath("job_exp", "//div[@class='terminalpage-left']/ul/li[5]/strong/text()")
        item_loader.add_xpath("edu", "//div[@class='terminalpage-left']/ul/li[6]/strong/text()")
        item_loader.add_xpath("job_type", "//div[@class='terminalpage-left']/ul/li[4]/strong/text()")
        item_loader.add_xpath("work_city", "//div[@class='terminalpage-left']/ul/li[2]/strong/a/text()")
        item_loader.add_css("company_name", ".inner-left a ::text")
        item_loader.add_css("company_url", ".inner-left a::attr(href)")
        item_loader.add_css("work_addr", ".terminalpage-main h2::text")
        item_loader.add_xpath("create_date", "//div[@class='terminalpage-left']/ul/li[3]/strong")
        item_loader.add_value("job_url", response.url)
        item_loader.add_value("job_url_id", get_md5(response.url))
        item_loader.add_css("job_advantage", ".welfare-tab-box ::text")
        item_loader.add_xpath("job_desc", "//div[@class='tab-inner-cont'][1]/p")
        item_loader.add_xpath("tag", "//div[@class='terminalpage-left']/ul/li[8]/strong/a/text()")
        return item_loader.load_item()
Example #14
0
 def parse_question(self, response):
     """Yield a ZhiHuQuestionItem for this question page, then request its answers."""
     question_id = response.meta.get('question_id', 0)
     loader = ArticleItemLoader(item=ZhiHuQuestionItem(),
                                response=response)
     loader.add_css('title', "h1.QuestionHeader-title::text")
     loader.add_value("question_id", question_id)
     loader.add_css("question_detail",
                    ".QuestionHeader-detail")
     loader.add_css("tags", ".Tag-content .Popover div::text")
     yield loader.load_item()
     # Kick off the paginated answer fetch (limit 20, offset 0).
     yield Request(self.query_next_answer_url.format(question_id, 20, 0),
                   headers=self.headers,
                   callback=self.parse_answers)
Example #15
0
    def parse_content(self, response):
        """Load a JobBoleArticleItem from an article page and yield it.

        The custom ArticleItemLoader / item definitions address the two
        shortcomings of the stock loader: every value arriving as a list,
        and the lack of per-field filtering.
        """
        cover_url = response.meta.get('front_image_url', '')

        loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                   response=response)

        # Fields scraped from the page.
        loader.add_css('title', 'h1::text')
        loader.add_css('content', 'div.entry')
        # Values known directly from the request/response.
        loader.add_value('url', response.url)
        # NOTE(review): url_object_id is filled with the raw URL here —
        # presumably an output processor hashes it; confirm in items.py.
        loader.add_value('url_object_id', response.url)
        loader.add_value('front_image_url', cover_url)
        loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")

        yield loader.load_item()
Example #16
0
    def parse_job(self, response):
        """Parse a cnblogs post page via ItemLoader and yield the item.

        Removed the superseded manual-extraction code that was parked in a
        no-op triple-quoted string. Note: load_item() returns every field as
        a list and applies no filtering by itself — the custom loader/item
        processors are expected to handle that.
        """
        front_image_url = response.meta.get("front_image_url", "")  # post cover image
        item_loader = ArticleItemLoader(item=ArticleSpiderItem(),
                                        response=response)
        item_loader.add_css('title', '#cb_post_title_url::text')
        item_loader.add_value('url', response.url)  # literal values go through add_value()
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('date', '#post-date::text')
        item_loader.add_value('front_image_url', [front_image_url])
        item_loader.add_css('content', '#cnblogs_post_body')

        item = item_loader.load_item()
        yield item
Example #17
0
    def parse_detail(self, response):
        """
        Extract an article's fields via ArticleItemLoader and yield the item.

        Removed ~50 lines of superseded xpath/css extraction that was kept
        commented out, plus an unused JobBoleAritcleItem() instance — the
        loader constructs the item.
        """
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobBoleAritcleItem(),
                                        response=response)
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
        item_loader.add_value('front_image_url', [front_image_url])
        item_loader.add_css('praise_nums', '.vote-post-up h10::text')
        item_loader.add_css('fav_nums', 'span.bookmark-btn::text')
        item_loader.add_css('comment_nums', 'span.hide-on-480::text')
        item_loader.add_css('content', 'div.entry')

        article_item = item_loader.load_item()

        yield article_item
Example #18
0
    def parse_detail(self, response):
        """Parse an article detail page and yield a JobBoleArticleItem.

        Field extraction goes through ArticleItemLoader so the per-field
        input/output processors defined in items.py handle cleanup (e.g.
        taking the first extracted value, parsing dates and counts).
        """
        # Cover-image URL forwarded from the listing page via request meta;
        # defaults to "" when the listing supplied none.
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        # MD5 of the URL gives a fixed-length unique id for the article.
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        # List-wrapped because the images pipeline expects a list of URLs.
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
Example #19
0
    def parse_detail(self, response):
        """Parse an article page and yield a JobBoleArticleItem via ItemLoader.

        ArticleItemLoader (defined in items.py) applies the per-field
        processors, e.g. taking the first element of each extracted list.
        """
        # Cover-image URL passed along from the listing request's meta.
        front_image_url = response.meta.get("front-img-url", "")
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
        item_loader.add_xpath("creat_date",
                              "//p[@class='entry-meta-hide-on-mobile']/text()")
        item_loader.add_xpath(
            "praise_num",
            "//span[contains(@class, 'vote-post-up')]/h10/text()")
        item_loader.add_xpath(
            "collect_num", "//span[contains(@class, 'bookmark-btn')]/text()")
        item_loader.add_css("comment_num",
                            ".btn-bluet-bigger.href-style.hide-on-480::text")
        item_loader.add_xpath("content", "//div[@class='entry']")
        item_loader.add_xpath(
            "tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
        item_loader.add_value("url", response.url)
        # MD5 of the URL serves as a fixed-length unique id.
        item_loader.add_value("url_object_id", get_md5(response.url))
        # List-wrapped because the images pipeline expects a list of URLs.
        item_loader.add_value("front_image_url", [front_image_url])

        article_item = item_loader.load_item()

        # Hand the item off to the pipelines.
        yield article_item
Example #20
0
    def parse_detail(self, response):
        """Parse an article page and yield an ArticleItem built via ItemLoader."""
        # Cover-image URL forwarded from the listing request's meta.
        front_image_url = response.meta.get('format_image', '')
        # ArticleItemLoader is a custom ItemLoader subclass (see items.py)
        # whose processors reduce each field's extracted list to one value.
        item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_css('post_time', 'p.entry-meta-hide-on-mobile::text')
        item_loader.add_value('url', response.url)
        # MD5 of the URL gives a fixed-length unique id for the article.
        item_loader.add_value('url_object_id', get_md5(response.url))
        # List-wrapped because the image download pipeline expects a list.
        item_loader.add_value('front_image_url', [front_image_url])
        item_loader.add_css('praise_nums', '.vote-post-up h10::text')
        item_loader.add_css('fav_nums', '.bookmark-btn::text')
        item_loader.add_css('comment_nums', 'a[href="#article-comment"] span::text')
        item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')
        item_loader.add_css('content', 'div.entry')
        # load_item() applies the registered rules and builds the item.
        article = item_loader.load_item()
        yield article
Example #21
0
    def parse_detail(self, response):
        """Populate a JobBoleArticleItem through an ItemLoader and yield it.

        ItemLoader offers three loading methods:
          * add_css(field, css)     -- extract with a CSS selector
          * add_xpath(field, xpath) -- extract with an XPath expression
          * add_value(field, value) -- fill a literal value directly

        load_item() collects every added value as a list per field; the
        input/output processors declared on the item fields in items.py
        (e.g. MapCompose / TakeFirst) turn those lists into final values.
        """
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)

        # Cover image forwarded from the listing page through request meta.
        front_image_url = response.meta.get("front_image_url", "")
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("content", "div.entry")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")

        article_item = item_loader.load_item()

        yield article_item  # passed on to the pipelines
Example #22
0
    def parse_detail(self, response):
        """Parse a JobBole article page and yield a JobBoleArticleItem."""
        # Cover-image URL carried over from the listing request's meta.
        front_image_url = response.meta.get("front_image_url", "")

        # ArticleItemLoader (a custom ItemLoader subclass, not the plain
        # scrapy ItemLoader) applies the per-field processors from items.py.
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        # MD5 of the URL serves as a fixed-length unique id.
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        # List-wrapped because the images pipeline expects a list of URLs.
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        # Yielding sends the item through the configured pipelines.
        yield article_item
Example #23
0
    def parse_detail(self, response):
        """Parse an article page with an ItemLoader and yield the item."""
        # Cover-image URL forwarded from the listing request's meta.
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                        response=response)

        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        # MD5 of the URL gives a fixed-length unique id for the article.
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        # List-wrapped because the images pipeline expects a list of URLs.
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", "span.bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()
        # Send the item on to the pipeline classes.
        yield article_item
Example #24
0
    def parse_detail(self, response):
        """Parse an article page via a custom ItemLoader and yield the item.

        ArticleItemLoader subclasses ItemLoader so that each field's
        extracted value list is reduced automatically (see items.py).
        """
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)

        # CSS-selector rules: first arg is the item field, second the
        # selector. Keeping selectors as data (rather than hard-coded
        # parsing logic) makes them easy to load from configuration later.
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("datetime", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_css("like_num", ".vote-post-up h10::text")
        item_loader.add_css("collect_num", ".bookmark-btn::text")
        item_loader.add_css("context", ".entry")

        # Cover-image URL comes from the listing request's meta, not the page.
        front_image_url = response.meta.get("front_image_url", "")
        # Values not taken from the page are filled with add_value().
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("front_image_url", [front_image_url])

        # load_item() evaluates the rules registered above; each field is
        # collected as a list and reduced by the item's processors.
        article_item = item_loader.load_item()

        yield article_item
Example #25
0
    def parse_detail(self, response):
        """Parse an article page and yield a JobBoleArticleItem.

        Uses the custom ArticleItemLoader (rather than scrapy's plain
        ItemLoader) so field processors trim each extracted list down to a
        single value; all selectors live in one place for maintainability.
        """
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_css('create_date', ".entry-meta-hide-on-mobile::text")
        item_loader.add_css('praise_nums', '.vote-post-up h10::text')
        item_loader.add_css('fav_nums', '.post-adds span:nth-child(2)::text')
        item_loader.add_css('comment_nums', '.post-adds a span::text')
        item_loader.add_css('tags', '.entry-meta-hide-on-mobile  a::text')
        item_loader.add_css('content', '.hentry')
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        # Wrap in a list: the images pipeline (and the sibling spiders in
        # this project) expect front_image_url to be a list of URLs, not a
        # bare string.
        item_loader.add_value("front_image_url",
                              [response.meta.get('front_image_url', '')])

        # Every field collected by the loader starts out as a list; the
        # item's output processors reduce them when load_item() runs.
        article_item = item_loader.load_item()

        yield article_item
Example #26
0
    def parse_detail(self, response):
        """Parse a JobBole article page into a JobBoleArticleItem.

        Reads the cover-image URL and the listing-page URL forwarded by the
        listing callback via ``response.meta``, then declares per-field
        extraction rules on an ArticleItemLoader so input/output processing
        stays centralized on the item definition.
        """
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        page_url = response.meta.get("page_url", "")  # URL of the listing page the article came from

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_xpath("create_date", '//p[@class="entry-meta-hide-on-mobile"]/text()')
        item_loader.add_value("front_image_url", [front_image_url])
        # BUG FIX: the page URL extracted above was previously discarded and a
        # literal "-" stored instead; store the real value (defaults to "").
        item_loader.add_value("page_url", page_url)
        item_loader.add_xpath("comment_nums", '//a[@href="#article-comment"]/span/text()')
        item_loader.add_xpath("praise_nums",
                              '//span[@class=" btn-bluet-bigger href-style vote-post-up   register-user-only "]/h10/text()')
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
        item_loader.add_xpath("content", '//div[@class="entry"]/text()')

        # load_item() applies the declared processors and builds the item.
        article_item = item_loader.load_item()

        yield article_item
    def parse_detail(self, response):
        """Parse one JobBole article page and yield a populated item.

        Field extraction is delegated to ArticleItemLoader (which applies the
        per-field input/output processors declared on the item), so this
        callback only declares where each field comes from.

        Cleanup: removed a dead ``JobBoleArticleItem()`` instantiation that
        was immediately shadowed by ``item_loader.load_item()``, plus two
        large commented-out alternative implementations.
        """
        # Cover image URL is forwarded from the listing callback via meta.
        front_image_url = response.meta.get("front_image_url", "")

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        # The loader calls .extract() on the selected values automatically.
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("createdate", ".entry-meta-hide-on-mobile::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", common.get_md5(response.url))
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("thumbs", "span[class*='vote-post-up'] h10::text")
        item_loader.add_css("bookmark", "span[class*='bookmark-btn']::text")
        item_loader.add_css("comments", "div.post-adds a span::text")
        item_loader.add_css("contents", "div.entry")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("author", ".copyright-area > a::text")

        article_item = item_loader.load_item()

        yield article_item
Example #28
0
    def parse_detail(self, response):
        """Extract a single article into a JobBoleArticleItem via the loader.

        All field rules are registered on an ArticleItemLoader; the item's
        declared processors handle cleaning and type conversion.
        """
        # Cover image URL handed over by the listing-page callback.
        cover_url = response.meta.get("front_image_url", "")

        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        # Literal values first, then CSS-selected fields.
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value("front_image_url", [cover_url])
        loader.add_css("title", ".entry-header h1::text")
        loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        loader.add_css("praise_nums", ".vote-post-up h10::text")
        loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        loader.add_css("fav_nums", ".bookmark-btn::text")
        loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        loader.add_css("content", "div.entry")

        yield loader.load_item()
Example #29
0
    def parse_detail(self, response):
        """Parse a JobBole article page through the item loader.

        ROBUSTNESS FIX: the cover-image URL is now read with ``meta.get`` and
        a "" default instead of ``response.meta["front_image_url"]``, which
        raised KeyError for any request that did not attach the key (the
        sibling callbacks in this file already use the tolerant form).

        Cleanup: removed a dead ``JobBoleArticleItem()`` instantiation and
        the large commented-out xpath/css alternative implementations.
        """
        front_image_url = response.meta.get("front_image_url", "")  # article cover image

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", "span.vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", "span.bookmark-btn::text")
        item_loader.add_css("tags", ".entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        # load_item() resolves the registered rules into the item.
        article_item = item_loader.load_item()
        # Hand the item to the engine asynchronously.
        yield article_item
Example #30
0
    def parse_detail(self, response):
        """Parse a JobBole article page and yield a JobBoleArticleItem.

        BUG FIX: ``front_image_url`` is now wrapped in a list before being
        added to the loader, matching every sibling example in this file
        (and the commented reference code that assigned
        ``article_item["front_image_url"] = [front_image_url]``); Scrapy's
        images pipeline expects the image-URL field to be a list of URLs.

        Cleanup: removed the commented-out xpath/css alternative
        implementations that preceded the loader-based version.
        """
        # Cover image URL forwarded from the listing callback via meta.
        front_image_url = response.meta.get("front_image_url", "")

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        # Resolve the registered rules into the item object.
        article_item = item_loader.load_item()

        yield article_item
Example #31
0
    def parse_question(self, response):
        """Extract a ZhihuQuestionItem from a question page, then schedule
        the first answers-API request for that question."""
        # The question id is attached to the request by the caller.
        question_id = response.meta.get('question_id', '')

        loader = ArticleItemLoader(item=ZhihuQuestionItem(), response=response)
        loader.add_value("url", response.url)
        loader.add_value("zhihu_id", question_id)
        loader.add_css("title", "h1.QuestionHeader-title::text")
        loader.add_css("content", ".QuestionHeader-detail")
        loader.add_css("answer_num", ".List-headerText span::text")
        loader.add_css("comments_num", ".QuestionHeader-Comment span::text")
        loader.add_css("watch_user_num",
                       ".NumberBoard:first-child .NumberBoard-itemValue::text")
        loader.add_css("click_num",
                       ".NumberBoard:last-child .NumberBoard-itemValue::text")
        loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

        question_item = loader.load_item()

        # Request the answers API: first page of 5 answers at offset 0.
        answers_request_url = self.start_answer_url.format(question_id, 5, 0)
        yield scrapy.Request(answers_request_url,
                             headers=self.headers,
                             callback=self.parse_answer)

        yield question_item