Ejemplo n.º 1
0
    def parse_job(self, response):
        """Populate a ``LagouJobItem`` from ``response`` and return it.

        Values are collected through ``LagouJobItemLoader``. The salary and
        work-years texts are first extracted from the page and then split
        into a lower/upper bound with ``get_salary`` / ``get_year``.

        :param response: response queried with CSS and XPath selectors.
        :return: the item produced by ``item_loader.load_item()``.
        """
        item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("title", '.job-name::attr("title")')

        # Parse the salary text once and subscript the result, instead of
        # parsing it twice and calling __getitem__ directly.
        # NOTE(review): assumes get_salary() is deterministic and free of
        # side effects -- confirm against its definition.
        salary = response.css('.job_request .salary::text').extract_first()
        salary_bounds = get_salary(salary)
        item_loader.add_value('salary_min', salary_bounds[0])
        item_loader.add_value('salary_max', salary_bounds[1])

        item_loader.add_xpath('job_city', '//*[@class="job_request"]/p/span[2]/text()')

        # Same treatment for the required years of experience.
        # NOTE(review): assumes get_year() is deterministic as well -- confirm.
        years = response.xpath('//*[@class="job_request"]/p/span[3]/text()').extract_first()
        year_bounds = get_year(years)
        item_loader.add_value('work_years_min', year_bounds[0])
        item_loader.add_value('work_years_max', year_bounds[1])

        item_loader.add_xpath('degree_need', '//*[@class="job_request"]/p/span[4]/text()')
        item_loader.add_xpath('job_time', '//*[@class="job_request"]/p/span[5]/text()')
        item_loader.add_css('publish_time', '.publish_time::text')
        item_loader.add_xpath('tags', '//*[@class="position-label clearfix"]/li/text()')
        item_loader.add_xpath('job_advantage', '//*[@class="job-advantage"]/p/text()')
        item_loader.add_css('job_desc', '.job_bt div')
        item_loader.add_css('job_addr', '.work_addr')
        item_loader.add_css('company', '#job_company dt a img::attr(alt)')
        item_loader.add_css('company_url', '#job_company dt a::attr(href)')
        item_loader.add_value('crawl_time', datetime.now())

        return item_loader.load_item()
Ejemplo n.º 2
0
 def parse_detail(self, response):
     """Load a ``JobBoleArticleItem`` through ``ArticleItemLoader`` and yield it.

     :param response: article page response; ``response.meta`` may carry a
         ``front_image_url`` entry (defaults to an empty string).
     :return: generator yielding the single loaded item.
     """
     # Article cover image URL, read from the response meta.
     front_image_url = response.meta.get("front_image_url", "")
     item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                     response=response)
     item_loader.add_css("title", ".entry-header h1::text")
     item_loader.add_value("url", response.url)
     item_loader.add_value("url_object_id", get_md5(response.url))
     item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
     item_loader.add_value("front_image_url", [front_image_url])
     item_loader.add_xpath(
         "thumbsUp_nums",
         "//span[contains(@class,'vote-post-up')]/h10/text()")
     item_loader.add_xpath("comment_nums",
                           "//a[@href='#article-comment']/span/text()")
     item_loader.add_xpath(
         "fav_nums", "//span[contains(@class,'bookmark-btn')]/text()")
     item_loader.add_xpath(
         "tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
     item_loader.add_xpath("content", "//div[@class='entry']")
     yield item_loader.load_item()
Ejemplo n.º 3
0
    def parse_job(self, response):
        """Parse a Lagou job posting into a ``LagouJobItem``.

        :param response: job page response queried with CSS/XPath selectors.
        :return: the item produced by the loader's ``load_item()``.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name span::text")
        loader.add_value("url", response.url)
        loader.add_css("salary", ".salary::text")

        # The <span> children of the job_request paragraph are read by position.
        request_spans = (
            ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
            ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
            ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
            ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
        )
        for field, xpath in request_spans:
            loader.add_xpath(field, xpath)

        css_fields = (
            ("publish_time", ".publish_time::text"),
            ("job_advantage", ".job-advantage p::text"),
            ("job_desc", ".job_bt div"),
            ("job_addr", ".work_addr"),
            ("tags", '.position-label li::text'),
            ("company_url", "#job_company dt a::attr(href)"),
            ("company_name", "#job_company dt a div h2::text"),
        )
        for field, css in css_fields:
            loader.add_css(field, css)

        loader.add_value("crawl_time", datetime.datetime.now())
        loader.add_value("url_object_id", get_md5(response.url))

        return loader.load_item()
Ejemplo n.º 4
0
    def parse_detail(self, response):
        """Load a ``JobBoleArticleItem`` through ``ArticleItemLoader`` and yield it.

        Selector-to-field mapping for the counters and tags:

        * ``fav_nums``     <- text of the bookmark button (``.bookmark-btn``)
        * ``comment_nums`` <- text of the ``#article-comment`` link
        * ``tags``         <- link texts inside ``p.entry-meta-hide-on-mobile``

        NOTE(review): the raw strings added here are presumably cleaned up
        (number parsing, date parsing, tag joining) by processors declared on
        ``ArticleItemLoader`` / ``JobBoleArticleItem`` -- confirm there.

        :param response: article page response; ``response.meta`` may carry a
            ``front_image_url`` entry (defaults to an empty string).
        :return: generator yielding the single loaded item.
        """
        # Article cover image URL, read from the response meta.
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        # Each counter must read its own element: bookmarks from the bookmark
        # button, comments from the comment anchor.
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        yield item_loader.load_item()
Ejemplo n.º 5
0
    def parse_detail(self, response):
        """Load an ``InputNewItem`` through ``ImportNewItemLoader`` and yield it.

        :param response: article page response queried with CSS/XPath selectors.
        :return: generator yielding the single loaded item.
        """
        loader = ImportNewItemLoader(item=InputNewItem(), response=response)

        loader.add_xpath('title', '//*[@class="entry-header"]/h1/text()')
        loader.add_value('url', response.url)
        loader.add_value('url_id', get_md5(response.url))

        # Values below are added raw and still need extraction.
        # NOTE(review): presumably handled by the loader's processors -- confirm.
        meta_xpaths = (
            ('update_time', '//*[@class="entry-meta-hide-on-mobile"]/text()'),
            ('category', '//*[@class="entry-meta-hide-on-mobile"]/a[1]/text()'),
        )
        for field, xpath in meta_xpaths:
            loader.add_xpath(field, xpath)

        # Tags start from the 2nd list entry; the whole element is added here.
        whole_elements = (
            ('tags', '.entry-meta-hide-on-mobile'),
            ('content', '.entry'),
        )
        for field, css in whole_elements:
            loader.add_css(field, css)

        yield loader.load_item()
Ejemplo n.º 6
0
 def parse_detail(self, response):
     """Build an ``ArticleItem`` directly from ``response`` and yield it.

     ``title``, ``date`` and ``category`` take the first XPath match. The
     three counters (``fav_num``, ``collections``, ``comment``) take the
     first run of digits found in their element's text, or ``0`` when no
     digits are found.

     :param response: article page response; ``response.meta`` may carry
         ``front_image_url`` and ``description`` (both default to ``''``).
     :return: generator yielding the single populated item.
     :raises IndexError: if the title, date or category XPath matches nothing.
     """
     item = ArticleItem()
     item['url_object_id'] = get_md5(response.url)
     item['front_image_url'] = [response.meta.get('front_image_url', '')]
     item['post_url'] = response.url
     item['description'] = response.meta.get('description', '')  # empty by default
     item['title'] = response.xpath(
         '//div[@class="entry-header"]/h1/text()').extract()[0]
     item['date'] = response.xpath(
         '//p[@class="entry-meta-hide-on-mobile"]/text()').extract(
         )[0].strip().replace('·', '').strip()
     item['category'] = response.xpath(
         '//p[@class="entry-meta-hide-on-mobile"]/a[@rel="category tag"]/text()'
     ).extract()[0]

     # (item field, XPath of the element whose text holds the number)
     counter_paths = (
         ('fav_num',
          '//span[contains(@class, "vote-post-up")]/h10/text()'),
         ('collections',
          '//span[@class=" btn-bluet-bigger href-style bookmark-btn  register-user-only "]/text()'),
         ('comment',
          '//span[@class="btn-bluet-bigger href-style hide-on-480"]/text()'),
     )
     for field, path in counter_paths:
         # Evaluate each XPath once; the raw string keeps \d a regex escape.
         digits = response.xpath(path).re(r'\d+')
         item[field] = int(digits[0]) if digits else 0
     yield item
Ejemplo n.º 7
0
    def parse_detail(self, response):
        """Extract the article fields into a ``JobboleArticleItem`` and yield it.

        All values are collected through ``AriticleItemLoader``; ``zan``,
        ``remark`` and ``collect`` are the like, comment and bookmark counters.

        :param response: article page response; ``response.meta`` may carry a
            ``front_image_url`` entry (defaults to an empty string).
        :return: generator yielding the single loaded item.
        """
        # Article cover image URL, read from the response meta.
        cover_url = response.meta.get("front_image_url", "")
        loader = AriticleItemLoader(item=JobboleArticleItem(), response=response)

        loader.add_css("title", ".entry-header h1::text")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        loader.add_value("front_image_url", [cover_url])

        remaining_css = (
            ("zan", ".vote-post-up h10::text"),
            ("remark", "a[href='#article-comment'] span::text"),
            ("collect", ".bookmark-btn::text"),
            ("tags", "p.entry-meta-hide-on-mobile a::text"),
            ("content", "div.entry"),
        )
        for field, css in remaining_css:
            loader.add_css(field, css)

        yield loader.load_item()
Ejemplo n.º 8
0
    def parse_detail(self, response):
        """Load a ``JobBoleArticleItem`` through ``ArticleItemLoader`` and yield it.

        :param response: article page response; ``response.meta`` may carry a
            ``front_image_url`` entry (defaults to an empty string).
        :return: generator yielding the single loaded item.
        """
        # Article cover image URL, read from the response meta.
        cover_url = response.meta.get("front_image_url", "")

        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        loader.add_css('title', '.entry-header h1::text')

        xpath_fields = (
            ('create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()'),
            ('praise_nums', "//span[contains(@class,'vote-post-up')]/h10/text()"),
            ('fav_nums', "//span[contains(@class,'bookmark-btn')]/text()"),
            ('comments_nums', "//a[@href='#article-comment']/span/text()"),
            ('content', "//div[@class='grid-8']"),
            ('tag', '//p[@class="entry-meta-hide-on-mobile"]/a/text()'),
        )
        for field, xpath in xpath_fields:
            loader.add_xpath(field, xpath)

        # Values that do not come from a selector.
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value('front_image_url', cover_url)
        loader.add_value('front_image_path', "none")

        yield loader.load_item()
Ejemplo n.º 9
0
    def parse_detail(self, response):
        """Build a ``JobboleArticleItem`` directly from ``response`` and yield it.

        The item is filled field by field from CSS selections. The release
        date text is parsed as ``%Y/%m/%d``; when it does not match that
        format, today's date is used instead. The bookmark and comment
        counters take the first run of digits in their text, or ``0`` when
        there are none.

        :param response: article page response; ``response.meta`` may carry a
            ``front_image_url`` entry (defaults to an empty string).
        :return: generator yielding the single populated item.
        :raises IndexError: if a selector that is indexed with ``[0]``
            matches nothing.
        :raises ValueError: if the vote-up text is not an integer.
        """

        def first_int(text):
            # First run of digits in ``text`` as an int, 0 when none is found.
            found = re.match(r'.*?(\d+).*', text)
            return int(found.group(1)) if found else 0

        item = JobboleArticleItem()

        # Article cover image URL; .get() falls back to an empty string.
        front_image_url = response.meta.get("front_image_url", "")
        title = response.css('div.entry-header h1::text').extract()[0]
        release_text = response.css(
            'p.entry-meta-hide-on-mobile ::text').extract()[0].replace(
                ' ·', '').strip()
        tags = ','.join(
            response.css('p.entry-meta-hide-on-mobile a::text').extract())
        voteup_num = int(
            response.css('span.vote-post-up h10::text').extract()[0])
        collection_num = first_int(
            response.css('span.bookmark-btn::text').extract()[0])
        comment_num = first_int(
            response.css('a[href="#article-comment"] span::text').extract()[0])
        content = response.css('div.entry').extract()[0]

        item["url_object_id"] = get_md5(response.url)
        item['front_image_url'] = [front_image_url]
        item['title'] = title
        item['url'] = response.url
        try:
            # strptime parses text into a datetime; strftime would only
            # format an existing datetime and fails on a string.
            release_date = datetime.datetime.strptime(release_text,
                                                      '%Y/%m/%d').date()
        except ValueError:
            # Text is not in YYYY/MM/DD form: fall back to today's date.
            release_date = datetime.datetime.now().date()
        item['release_date'] = release_date
        item['tags'] = tags
        item['voteup_num'] = voteup_num
        item['collection_num'] = collection_num
        item['comment_num'] = comment_num
        item['content'] = content

        yield item