예제 #1
0
파일: v2ex.py 프로젝트: z596593851/spider
    def parse_question(self, response):
        """Parse a V2EX question page into a V2exQuItem.

        Prefers the markdown-rendered body; falls back to the plain topic
        content when no markdown block exists on the page.
        """
        question_loader = ArticleItemLoader(item=V2exQuItem(), response=response)
        question_loader.add_xpath("title", "//div[@class='header']/h1/text()")
        mar_content = response.xpath("//div[@class='markdown_body']").extract()
        if not mar_content:
            # No markdown body: use the raw topic content instead.
            content = "".join(response.xpath("//div[@class='topic_content']").extract()).replace("\n", "")
        else:
            content = "".join(mar_content).replace("\n", "")
        match_re1 = re.match(self.content_rule, content)
        if match_re1:
            question_loader.add_value("content", match_re1.group(1))

        comment_count = response.xpath("//div[@class='cell']/span[@class='gray']/text()").extract()
        if not comment_count:
            # No comment header present means zero comments.
            question_loader.add_value("comment_count", 0)
        else:
            match_re2 = re.match(self.comment_rule, comment_count[0])
            if match_re2:
                question_loader.add_value("comment_count", match_re2.group(1))
        # NOTE(review): user_id is randomized in [2, 14] — presumably maps to
        # pre-seeded demo users; confirm against the pipeline/DB.
        question_loader.add_value("user_id", random.randint(2, 14))
        question_loader.add_value("created_date", time.time())
        yield question_loader.load_item()
예제 #2
0
    def parse_detail(self, response):
        """Parse a Jobbole article page via ItemLoader and yield the item.

        Fields: title, url, url_object_id (md5), create_date, cover image,
        comment/fav counts, tags and full content HTML.
        """
        item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                        response=response)
        item_loader.add_xpath('title',
                              '//div[@class="entry-header"]/h1/text()')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_xpath(
            'create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')

        # Cover image URL was stashed on the request meta by the list parser.
        front_image_url = response.meta.get("front_image_url", "")
        item_loader.add_value('front_image_url', [front_image_url])

        item_loader.add_xpath('comment_nums',
                              '//a[@href="#article-comment"]/span/text()')

        # The fav-count node is absent when nobody bookmarked the article,
        # so default to '0' instead of letting None through.
        fav_nums = response.xpath(
            '//div[@class="post-adds"]/span[2]/h10/text()').extract_first()
        if fav_nums is None:
            fav_nums = '0'
        item_loader.add_value('fav_nums', fav_nums)

        item_loader.add_xpath('tags', '//div[@class="entry-meta"]/p/a/text()')
        item_loader.add_xpath('content', '//div[@class="entry"]')

        yield item_loader.load_item()
예제 #3
0
    def parse_detail(self, response):
        """Parse a Jobbole article page and yield an ArticlespiderItem.

        Fixes vs. the original: non-greedy regexes so multi-digit counts are
        captured whole (greedy ``.*(\\d+)`` kept only the last digit), and
        ``extract_first`` with defaults so missing nodes no longer raise
        IndexError.
        """
        # Publish time; 'no' when the node is absent.
        crttime_content = response.xpath('//div[@class="entry-meta"]/p/text()').extract()
        if not crttime_content:
            create_time = 'no'
        else:
            create_time = crttime_content[0].replace('·', '').strip()
        # Article category; 0 when absent.
        article_kind_content = response.xpath('//div[@class="entry-meta"]/p/a/text()').extract()
        if not article_kind_content:
            article_kind = 0
        else:
            article_kind = article_kind_content[0]
        # Praise count — extract_first with a default avoids IndexError.
        praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract_first('0')
        # Bookmark count is embedded in text such as " 3 收藏".
        fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract_first('')
        match_re = re.match(r".*?(\d+).*", fav_nums)
        if match_re:
            fav_nums = match_re.group(1)
        else:
            fav_nums = 0
        # Comment count.
        commant_nums = response.xpath("//a[@href='#article-comment']/span/text()").extract_first('')
        match_re = re.match(r".*?(\d+).*", commant_nums)
        if match_re:
            commant_nums = match_re.group(1)
        else:
            commant_nums = 0
        # Author name from the bio box; 'no' when absent.
        author_name_content = response.xpath("//div[@id='author-bio']//a/text()").extract()
        if not author_name_content:
            author_name = 'no'
        else:
            author_name = author_name_content[0]

        item_loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
        item_loader.add_value('create_time', [create_time])
        item_loader.add_value('article_kind', [article_kind])
        item_loader.add_value('praise_nums', [praise_nums])
        item_loader.add_value('fav_nums', [fav_nums])
        item_loader.add_value('commant_nums', [commant_nums])
        item_loader.add_value('author_name', [author_name])
        article_item = item_loader.load_item()
        yield article_item
예제 #4
0
    def parse_content(self, response):
        """Parse a Douban movie page into a DouBanItem.

        Fixes vs. the original: the local ``time`` no longer shadows the
        ``time`` module, missing nodes no longer raise IndexError, and
        area/language/nickname are only added when the #info block yielded
        enough matches.
        """
        item_loader = ArticleItemLoader(item=DouBanItem(), response=response)
        item_loader.add_value("url", response.url)
        item_loader.add_xpath("title", "//div[@id='content']/h1/span[1]/text()")
        item_loader.add_xpath("director", "//div[@id='info']/span[1]/span[2]/a/text()")
        item_loader.add_css("score", "div.rating_self strong::text")
        item_loader.add_xpath("introduction", "//span[@property='v:summary']/text()")
        item_loader.add_xpath("front_image_url", "//*[@id='mainpic']/a/img/@src")

        # The #info block is bare text nodes; self.info_rule extracts values.
        infos = response.xpath("//*[@id='info']/text()").extract()
        info_list = []
        for info in infos:
            match_re = re.match(self.info_rule, info.strip())
            if match_re:
                info_list.append(match_re.group(1))

        # Release year lives in the second <span> of the title header.
        year_text = response.xpath("//div[@id='content']/h1/span[2]/text()").extract_first()
        if year_text:
            match_re = re.match(self.time_rule, year_text)
            if match_re:
                item_loader.add_value("time", match_re.group(1))

        # Guard: some pages yield fewer than three matched info lines.
        for field, idx in (("area", 0), ("language", 1), ("nickname", 2)):
            if idx < len(info_list):
                item_loader.add_value(field, info_list[idx])

        douban_item = item_loader.load_item()
        yield douban_item
예제 #5
0
    def parse_job(self, response):
        """Extract a LaGou job posting into a LaGouItem via the ItemLoader."""
        loader = ArticleItemLoader(item=LaGouItem(), response=response)
        # CSS-selectable fields.
        loader.add_css("job_name", '.job-name::attr(title)')
        loader.add_css("salary", ".salary::text")
        loader.add_css("company_name", "#job_company .b2::attr(alt)")
        loader.add_css("company_url", ".job_company dt a::attr(href)")
        loader.add_css("work_addr", ".work_addr")
        loader.add_css("create_date", ".publish_time::text")
        loader.add_css("job_advantage", ".job-advantage p::text")
        loader.add_css("job_desc", ".job_bt div")
        loader.add_css("tag", ".position-label li")
        # The job_request spans are positional on the page.
        loader.add_xpath(
            "job_exp", "//dd[@class='job_request']/p/span[3]/text()")
        loader.add_xpath(
            "edu", "//dd[@class='job_request']/p/span[4]/text()")
        loader.add_xpath(
            "job_type", "//dd[@class='job_request']/p/span[5]/text()")
        loader.add_xpath(
            "work_city", "//dd[@class='job_request']/p/span[2]/text()")
        # Literal values derived from the request URL.
        loader.add_value("job_url", response.url)
        loader.add_value("job_url_id", get_md5(response.url))
        return loader.load_item()
예제 #6
0
    def parse_job(self, response):
        """Parse a LaGou job detail page into a LagouJobItem."""
        loader = ArticleItemLoader(item=LagouJobItem(), response=response)
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value("crawl_time", datetime.now())

        loader.add_css("title", ".job-name::attr(title)")
        loader.add_css("salary", ".job_request .salary::text")
        # The job_request spans are positional:
        # span[2]=city, span[3]=experience, span[4]=degree, span[5]=type.
        for field, idx in (("job_city", 2), ("work_years", 3),
                           ("degree_need", 4), ("job_type", 5)):
            loader.add_xpath(
                field, "//*[@class='job_request']//span[%d]/text()" % idx)

        loader.add_css("tags", '.position-label li::text')
        loader.add_css("publish_time", ".publish_time::text")
        loader.add_css("job_advantage", ".job-advantage p::text")
        loader.add_css("job_desc", ".job_bt div")
        loader.add_css("job_addr", ".work_addr")
        loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        loader.add_css("company_url", "#job_company dt a::attr(href)")

        return loader.load_item()
예제 #7
0
    def parse_detail(self, response):
        """Parse a Jianshu article page into a JianShuArticlespiderItem.

        The cover-image URL arrives via the request meta (key 'front_end_url').
        """
        front_end_url = response.meta["front_end_url"]

        item = ArticleItemLoader(item=JianShuArticlespiderItem(), response=response)
        item.add_xpath("title", "//div[@class='post']/div[@class='article']/h1[@class='title']/text()")
        item.add_value("url", response.url)
        item.add_value("front_image_url", [front_end_url])

        yield item.load_item()
예제 #8
0
    def parse_detail(self, response):
        """Parse an article page into a JobBoleArticleItem via ItemLoader.

        The image URL comes from the crawl request meta (key 'meta_1').
        """
        image_url = response.meta.get('meta_1')

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_xpath('title', '//h1/text()')
        item_loader.add_value('image_url', [image_url])
        item_loader.add_value('url_object_id', get_md5(image_url))
        # BUG FIX: the original added '' under 'url_object_id' a second time,
        # polluting the md5 value; the commented-out original assignment shows
        # the empty string was intended for 'image_path'.
        item_loader.add_value('image_path', '')

        article_item = item_loader.load_item()
        yield article_item
예제 #9
0
    def parse_detail(self, response):
        """Parse a cnblogs news page, then fire an async request for the
        view/comment counters served by a separate AJAX endpoint.

        The partially-filled ItemLoader travels in the request meta and is
        completed in ``parse_nums``.
        """
        # The numeric post id is embedded in the article URL.
        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            post_id = match_re.group(1)

            item_loader = ArticleItemLoader(item=CnblogsArticleItem(), response=response)
            item_loader.add_xpath('title', "//*[@id='news_title']//a/text()")
            item_loader.add_xpath('content', "//*[@id='news_content']")
            item_loader.add_xpath('tags', "//*[@class='news_tags']//a/text()")
            item_loader.add_xpath('create_date', "//*[@id='news_info']//*[@class='time']/text()")
            item_loader.add_value('url', response.url)
            item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))

            # Use an async Request — a blocking requests.get here would stall
            # the Twisted reactor. Absolute path is joined onto the domain.
            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                          meta={"item_loader": item_loader, "url_object_id": get_md5(response.url)},
                          callback=self.parse_nums)
예제 #10
0
    def parse_detail(self, response):
        """Extract article fields with an ItemLoader, then asynchronously
        request the AJAX counters endpoint; ``parse_nums`` finishes the item.
        """
        # The numeric post id is embedded in the article URL.
        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            post_id = match_re.group(1)

            item_loader = ArticleItemLoader(item=CdnBlogArtcleItem(), response=response)
            item_loader.add_xpath("title", '//*[@id="news_title"]//a/text()')
            item_loader.add_xpath("create_date", '//div[@id="news_info"]//span[@class="time"]/text()')
            item_loader.add_xpath("content", '//div[@id="news_content"]')
            item_loader.add_xpath("tags", '//div[@class="news_tags"]//a/text()')
            item_loader.add_value("url", response.url)
            # Only add the cover image when the list page actually supplied one.
            if response.meta.get("front_image_url", []):
                item_loader.add_value("front_image_url", response.meta.get("front_image_url", []))

            # Async request instead of a blocking requests.get; the loader
            # rides along in meta (key name kept for parse_nums compatibility,
            # even though it holds a loader rather than an item).
            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)), meta={"article_item": item_loader, "url": response.url}, callback=self.parse_nums)
예제 #11
0
    def parse_list(self, response):
        """Parse a BiQuGe book index page.

        Yields the book-level item, then schedules one detail request per
        chapter link, carrying the book's url_object_id along in meta.
        """
        post_url = response.xpath('//*[@id="fmimg"]/img/@src').extract_first(
            "")  # cover image
        front_image_url = parse.urljoin(response.url, post_url)
        # Chapter links; renamed so the builtin `list` is not shadowed.
        chapter_links = response.xpath('//*[@id="list"]/dl/dd/a/@href').extract()

        # Load the book item through the project's custom ArticleItemLoader.
        item_loader = ArticleItemLoader(item=BiQuGeListItem(),
                                        response=response)
        item_loader.add_value("url_object_id", get_md5(response.url))  # md5 id
        item_loader.add_xpath("title", '//*[@id="info"]/h1/text()')  # book title
        item_loader.add_xpath("author", '//*[@id="info"]/p[1]/text()')  # author
        item_loader.add_xpath("last_update_time",
                              '//*[@id="info"]/p[3]/text()')  # last update time
        item_loader.add_value("front_image_url", [front_image_url])  # image link

        article_item = item_loader.load_item()
        yield article_item

        # Crawl every chapter detail page.
        for each in chapter_links:
            yield Request(
                url=parse.urljoin(response.url, each),
                meta={"url_object_id": article_item['url_object_id']},
                callback=self.parse_details)
예제 #12
0
    def parse_detail(self, response):
        """Parse a Jobbole article detail page via ItemLoader.

        Fields: title, create_date, comment count, content HTML, url,
        url_object_id (md5) and the cover-image URL from the request meta.
        """
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_xpath("title",
                              "//div[@class='entry-header']/h1/text()")
        item_loader.add_xpath(
            "create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
        item_loader.add_xpath("comment_nums",
                              "//a[@href='#article-comment']/span/text()")

        item_loader.add_css("content", "div.entry")

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        # Cover image URL handed over from the list-page request meta.
        item_loader.add_value("front_image_url",
                              response.meta.get("front_image_url", ""))

        article_item = item_loader.load_item()

        yield article_item
예제 #13
0
    def detail_parse(self, response):
        """Parse a news detail page into an ArticleItem via ItemLoader."""
        item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
        item_loader.add_xpath('title', '//div[@class="main_left"]//h2/text()')  # headline
        item_loader.add_xpath('content', '//div[@class="wen_article"]')  # body HTML
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_xpath('create_time',
                              '//div[@class="meta"]/span/text()')  # publish time
        yield item_loader.load_item()
예제 #14
0
    def parse_detail(self, response):
        """Parse a Jobbole article page into a JobBleArticleItem via ItemLoader.

        Fields: title, url, url_object_id (md5), create_date, cover image,
        praise/comment/fav counts, tags and content HTML.
        """
        # Cover image URL stashed on the request meta by the list parser.
        front_image_url = response.meta.get("front_image_url", "")
        itemloader = ArticleItemLoader(item=JobBleArticleItem(),
                                       response=response)
        itemloader.add_xpath(
            "title", '/html/body/div[1]/div[3]/div[1]/div[1]/h1/text()')
        itemloader.add_value("url", response.url)
        itemloader.add_value('url_object_id', get_md5(response.url))
        itemloader.add_xpath(
            "create_date", '/html/body/div[1]/div[3]/div[1]/div[2]/p/text()')
        itemloader.add_value("front_image_url", [front_image_url])
        itemloader.add_xpath(
            'praise_nums',
            '//span[contains(@class, "vote-post-up")]/h10/text()')
        itemloader.add_css('comment_nums',
                           ".btn-bluet-bigger.href-style.hide-on-480::text")
        itemloader.add_css('fav_nums', '.bookmark-btn::text')
        itemloader.add_xpath(
            'tags', '//p[@class="entry-meta-hide-on-mobile"]/a/text()')
        itemloader.add_xpath('content', "//div[@class='entry']")
        article_item = itemloader.load_item()

        yield article_item
예제 #15
0
    def parse_detail(self, response):
        """Parse a Jobbole article page via ItemLoader and yield the item.

        Fields: title, create_date, url, url_object_id (md5), cover image,
        comment/fav counts, tags and the paragraph text of the body.
        """
        # Cover-image URL passed along from the list-page request meta.
        front_image_url = response.meta.get('front_image_url', '')

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_xpath('title',
                              '//div[@class="entry-header"]/h1/text()')
        item_loader.add_xpath(
            'create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_value('front_image_url', [front_image_url])
        item_loader.add_xpath('comment_nums',
                              "//a[@href='#article-comment']/text()")
        item_loader.add_xpath(
            'fav_nums', "//span[contains(@class,'bookmark-btn')]/text()")
        item_loader.add_xpath(
            'tags', "//p[@class='entry-meta-hide-on-mobile']/a/text()")
        item_loader.add_xpath('content', "//div[@class='entry']/p/text()")

        article_item = item_loader.load_item()

        yield article_item
예제 #16
0
    def parse_page(self, response):
        """Parse an article detail page: title, create date, praise/bookmark/
        comment counts, tags and body, loaded through the project ItemLoader.
        """
        # Resolve the cover image (handed over via request meta) to an
        # absolute URL before loading it.
        cover_url = parse.urljoin(response.url,
                                  response.meta.get("front_image_url", ""))

        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        loader.add_value("url", response.url)
        loader.add_value("url_id_md5", get_md5(response.url))
        loader.add_value("front_image_url", [cover_url])
        loader.add_xpath("title", "/html/head/title/text()")
        loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
        loader.add_xpath("praise_nums", "//span[contains(@class, 'vote-post-up')]/h10/text()")
        loader.add_xpath("fav_nums", "//span[@class=' btn-bluet-bigger href-style bookmark-btn  register-user-only ']/text()")
        loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span")
        loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
        loader.add_xpath("content", "//div[@class='entry']")

        yield loader.load_item()
예제 #17
0
    def parse_detail(self, response):
        """Parse an article detail page and yield a populated JobBoleArticleItem.

        The previous version first extracted every field by hand into
        ``article_item`` and then unconditionally rebound ``article_item`` to
        ``item_loader.load_item()`` — so the whole manual pass was dead work,
        and its bare ``extract()[0]`` calls could raise IndexError on pages
        missing a node.  Only the ItemLoader path is kept.
        """
        # Cover-image URL forwarded from the listing page via request meta.
        front_image_url = response.meta.get("front_image_url", "")

        # The ItemLoader centralises selector maintenance; field values are
        # collected as lists and cleaned by the item's input/output processors.
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_xpath("title", "//*[@class='entry-header']/h1/text()")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_xpath("create_date",
                              "//*[@class='entry-meta']/p/text()")
        # Wrapped in a list: image pipelines iterate over this field.
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_xpath(
            "praise_nums",
            "//span[contains(@class,'vote-post-up')]/h10/text()")
        item_loader.add_xpath("comment_nums",
                              "//a[@href='#article-comment']/span/text()")
        item_loader.add_xpath(
            "fav_nums", "//span[contains(@class,'bookmark-btn')]/text()")
        item_loader.add_xpath("tags", "//*[@class='entry-meta']/p/a/text()")
        item_loader.add_xpath("content", "//div[@class='entry']")

        article_item = item_loader.load_item()

        yield article_item
예제 #18
0
    def parse_detail(self, response):
        """Populate a JobBoleArticleItem from one article page via ItemLoader."""
        # Values forwarded from the listing request's meta.
        cover_url = response.meta.get("front_image_url", "")  # article cover image
        page_url = response.meta.get("page_url", "")  # listing page the article came from

        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_xpath("create_date", '//p[@class="entry-meta-hide-on-mobile"]/text()')
        loader.add_value("front_image_url", [cover_url])
        # Placeholder value; the real page URL is not stored here.
        loader.add_value("page_url", "-")
        loader.add_xpath("comment_nums", '//a[@href="#article-comment"]/span/text()')
        loader.add_xpath("praise_nums",
                         '//span[@class=" btn-bluet-bigger href-style vote-post-up   register-user-only "]/h10/text()')
        loader.add_css("fav_nums", ".bookmark-btn::text")
        loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
        loader.add_xpath("content", '//div[@class="entry"]/text()')

        yield loader.load_item()
예제 #19
0
    def parse_zl(self, response):
        """Extract one ZhiLian job posting into a ZhiLianItem."""
        loader = ArticleItemLoader(item=ZhiLianItem(), response=response)
        loader.add_css("job_name", '.fixed-inner-box h1::text')
        loader.add_xpath("salary", "//div[@class='terminalpage-left']/ul/li[1]/strong/text()")
        loader.add_xpath("job_exp", "//div[@class='terminalpage-left']/ul/li[5]/strong/text()")
        loader.add_xpath("edu", "//div[@class='terminalpage-left']/ul/li[6]/strong/text()")
        loader.add_xpath("job_type", "//div[@class='terminalpage-left']/ul/li[4]/strong/text()")
        loader.add_xpath("work_city", "//div[@class='terminalpage-left']/ul/li[2]/strong/a/text()")
        loader.add_css("company_name", ".inner-left a ::text")
        loader.add_css("company_url", ".inner-left a::attr(href)")
        loader.add_css("work_addr", ".terminalpage-main h2::text")
        # Feedback-rate selector kept for reference; currently disabled.
        # loader.add_xpath("feedback", "//div[@class='publisher_data']/div[2]/span[@class='tip']/i/text()")
        loader.add_xpath("create_date", "//div[@class='terminalpage-left']/ul/li[3]/strong")
        loader.add_value("job_url", response.url)
        loader.add_value("job_url_id", get_md5(response.url))
        loader.add_css("job_advantage", ".welfare-tab-box ::text")
        loader.add_xpath("job_desc", "//div[@class='tab-inner-cont'][1]/p")
        loader.add_xpath("tag", "//div[@class='terminalpage-left']/ul/li[8]/strong/a/text()")
        return loader.load_item()
예제 #20
0
    def parse_detail(self, response):
        """Build a JobBoleArticleItem for one article detail page."""
        cover_url = response.meta.get("front_image_url", "")  # article cover image

        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
        loader.add_value("front_image_url", [cover_url])
        loader.add_xpath("praise_nums", "//span[contains(@class, 'vote-post-up')]/h10/text()")
        loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
        loader.add_xpath("fav_nums", "//span[contains(@class, 'bookmark-btn')]/text()")
        loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
        loader.add_xpath("content", "//div[@class='entry']")

        yield loader.load_item()
예제 #21
0
    def parse_detail(self, response):
        """Load a JobBoleArticleItem from the article page with an ItemLoader."""
        cover_url = response.meta.get('front_image_url', '')  # article cover image

        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        loader.add_xpath('title', '//*[@class="entry-header"]/h1/text()')
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        loader.add_xpath('create_date', '//*[@class="entry-meta"]/p/text()[1]')
        loader.add_value('front_image_url', [cover_url])
        loader.add_xpath('praise_nums', '//h10/text()')
        loader.add_xpath('collect_nums', '//*[@class="post-adds"]/span[2]/text()')
        loader.add_xpath('comment_nums', '//*[@class="post-adds"]/a[1]/span/text()')
        loader.add_xpath('content', '//*[@class="entry"]')
        loader.add_xpath('tags', '//div[@class="entry-meta"]/p/a/text()')

        yield loader.load_item()
예제 #22
0
    def parse_detail(self, response):
        """Parse an article detail page into a JobBoleArticleItem.

        Field extraction goes through the ItemLoader so the CSS/XPath
        selectors stay in one place; each field is gathered as a list and
        post-processed by the item's processors.
        """
        cover_url = response.meta.get("front_image_url", "")  # article cover image

        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        loader.add_css("title", ".entry-header h1::text")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
        loader.add_value("front_image_url", [cover_url])
        loader.add_css("praise_nums", ".vote-post-up h10::text")
        loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        loader.add_css("fav_nums", ".bookmark-btn::text")
        loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
        loader.add_xpath("content", "//div[@class='entry']")

        yield loader.load_item()
예제 #23
0
    def parse_detail(self, response):
        """Parse an article detail page into a JobBoleArticleItem.

        Fixes over the previous version:
        - ``front_image_url`` is now added as a one-element list, matching the
          sibling spiders; image pipelines iterate over this field, so a bare
          string would be consumed character by character.
        - The throwaway ``JobBoleArticleItem()`` created before the ItemLoader
          (its value was immediately overwritten) is removed.
        """
        front_image_url = response.meta.get("front_image_url", "")  # cover image

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_xpath("title",
                              '//div[@class="entry-header"]/h1/text()')
        item_loader.add_xpath(
            "create_date",
            '//div/p[@class="entry-meta-hide-on-mobile"]/text()')
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        # Wrapped in a list so downstream image handling gets an iterable of URLs.
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_xpath(
            "praise_nums",
            '//div/span[contains(@class,"vote-post-up")]/h10/text()')
        item_loader.add_xpath(
            "fav_nums", '//div/span[contains(@class,"bookmark-btn")]/text()')
        item_loader.add_xpath("comment_nums",
                              '//div/a[@href="#article-comment"]/span/text()')
        item_loader.add_xpath("content", '//div[@class="entry"]')
        item_loader.add_xpath(
            "tags", '//div/p[@class="entry-meta-hide-on-mobile"]/a/text()')

        article_item = item_loader.load_item()

        yield article_item
예제 #24
0
    def parse_detail(self, response):
        """Parse one JobBole article page and yield a JobbboleItem.

        Cleanups over the previous version: the no-op
        ``if response.url == 'http://blog.jobbole.com/all-posts/': pass``
        and the throwaway ``JobbboleItem()`` instantiation (immediately
        overwritten by ``load_item()``) are removed; behaviour for every
        yielded item is unchanged.
        """
        item_loader = ArticleItemLoader(item=JobbboleItem(), response=response)
        item_loader.add_xpath("title", '//*[@class="entry-header"]/h1/text()')
        item_loader.add_value("url", response.url)
        item_loader.add_xpath("create_date",
                              '//*[@class="entry-meta"]/p/text()')
        item_loader.add_xpath(
            "praise_nums",
            "//span[contains(@class,'vote-post-up')]/h10/text()")
        item_loader.add_xpath(
            "book_mark", "//span[contains(@class,'bookmark-btn')]/text()")
        item_loader.add_xpath("comment_nums",
                              "//a[@href='#article-comment']/span/text()")
        item_loader.add_xpath("content", "//*[@class='entry']/p/text()")
        article_item = item_loader.load_item()
        yield article_item
예제 #25
0
    def parse_detail(self, response):
        """Extract one article's fields into a JobBoleArticleItem via ItemLoader."""
        cover_url = response.meta.get('front_image_url', '')  # article cover image

        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        loader.add_xpath('create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')
        loader.add_value('front_image_url', [cover_url])
        loader.add_xpath('praise_nums', '//span[contains(@class, "vote-post-up")]/h10/text()')
        loader.add_xpath('comment_nums', '//a[@href="#article-comment"]/span/text()')
        loader.add_xpath('fav_nums', '//span[contains(@class, "bookmark-btn")]/text()')
        loader.add_xpath('tags', '//p[@class="entry-meta-hide-on-mobile"]/a/text()')
        loader.add_xpath('content', '//div[@class="entry"]')

        yield loader.load_item()
예제 #26
0
    def parse_detail(self, response):
        """
        Parse an article detail page via ItemLoader and yield the item.

        The redundant ``JobboleArticleItem()`` instantiation that preceded the
        loader (its binding was immediately overwritten by ``load_item()``)
        has been removed; the yielded item is identical.
        """
        # NOTE(review): the meta key is spelled "font_image_url" (missing the
        # 'r') — confirm the requesting callback uses the same spelling,
        # otherwise this always falls back to "".
        front_image_url = response.meta.get("font_image_url", "")
        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_xpath("praise_nums", "//span[contains(@class, 'vote-post-up')]/h10/text()")
        item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
        item_loader.add_xpath("fav_nums", "//span[contains(@class, 'bookmark-btn')]/text()")
        item_loader.add_xpath("content", "//div[@class='entry']")
        item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
        article_item = item_loader.load_item()
        yield article_item
예제 #27
0
    def parse_detail(self, response):
        """Load the article fields into a JobboleArticleItem and yield it."""
        cover_url = response.meta.get("front_image_url", "")  # article cover image

        loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        loader.add_xpath('title', "//div[@class='entry-header']/h1/text()")
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        loader.add_xpath('create_date', "//p[@class='entry-meta-hide-on-mobile']/text()")
        loader.add_value('front_image_url', [cover_url])
        loader.add_xpath('praise_nums', "//span[contains(@class,'vote-post-up')]/h10/text()")
        loader.add_xpath('comment_nums', "//a[@href='#article-comment']/span/text()")
        loader.add_xpath('fav_nums', "//span[contains(@class,'bookmark-btn')]/text()")
        loader.add_xpath('tags', "//p[@class='entry-meta-hide-on-mobile']/a/text()")
        # Body extraction intentionally disabled in this spider:
        # loader.add_xpath('content', "//div[@class='entry']")

        yield loader.load_item()
예제 #28
0
    def parse_detail(self, response):
        """Yield a JobBoleArticleItem built from the article detail page."""
        cover_url = response.meta.get("image_url", "")  # article cover image

        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        loader.add_xpath("title", "//h1/text()")
        loader.add_xpath("publish_time", "//p[@class='entry-meta-hide-on-mobile']/text()")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value("image_url", [cover_url])
        loader.add_xpath("zan_nums", "//span[contains(@class,'vote-post-up')]/h10/text()")
        loader.add_xpath("collect_nums", "//span[@data-site-id='2']/text()")
        loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
        loader.add_xpath("content", "//div[@class='entry']")
        loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")

        yield loader.load_item()
예제 #29
0
    def parse_detail(self, response):
        """Parse an article detail page into a JobBoleArticleItem.

        Field extraction is delegated to an ArticleItemLoader so that the
        per-field input/output processors (defined on the item) do the
        cleaning. The cover-image URL arrives via ``response.meta``.
        Yields the loaded item into the pipeline.
        """
        cover_url = response.meta.get("front-img-url", "")  # cover image URL

        loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                   response=response)

        # Values derived from the request itself.
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value("front_image_url", [cover_url])  # list for image pipeline

        # Values scraped from the page body.
        loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
        loader.add_xpath("creat_date",
                         "//p[@class='entry-meta-hide-on-mobile']/text()")
        loader.add_xpath("praise_num",
                         "//span[contains(@class, 'vote-post-up')]/h10/text()")
        loader.add_xpath("collect_num",
                         "//span[contains(@class, 'bookmark-btn')]/text()")
        loader.add_css("comment_num",
                       ".btn-bluet-bigger.href-style.hide-on-480::text")
        loader.add_xpath("content", "//div[@class='entry']")
        loader.add_xpath("tags",
                         "//p[@class='entry-meta-hide-on-mobile']/a/text()")

        yield loader.load_item()
예제 #30
0
    def parse_detail(self, response):
        """Request callback: extract the article fields from a detail page.

        Uses an ArticleItemLoader so the item's input/output processors
        handle per-field cleaning (number parsing, date conversion, tag
        filtering). The cover-image URL is forwarded from the listing
        callback via ``response.meta`` — ``.get`` avoids a KeyError when
        it is absent. Yields the loaded item to the pipelines.
        """
        front_image_url = response.meta.get("front_image_url", "")  # cover image

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_xpath("title", "//*[@class='entry-header']/h1/text()")
        item_loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
        item_loader.add_value("url", response.url)
        # Image pipeline expects a list of URLs, hence the wrapping.
        item_loader.add_value("front_image_url", [front_image_url])
        # BUG FIX: url_object_id must be the md5 digest of the URL (fixed-width
        # dedup key), not the raw URL — matches the item contract used by the
        # other spiders in this project.
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_xpath("praise_nums", "//span[contains(@class, 'vote-post-up')]/h10/text()")
        item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
        item_loader.add_xpath("fav_nums", "//span[contains(@class, 'bookmark-btn')]/text()")
        item_loader.add_xpath("content", "//div[@class='entry']")
        item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")

        article_item = item_loader.load_item()

        # Hand the item to pipelines.py.
        yield article_item