def parse_job(self, response):
    """Parse a Lagou job-detail page into a LagouJobItem.

    :param response: scrapy Response for a job-detail URL
    :return: the populated LagouJobItem
    """
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("title", '.job-name::attr("title")')

    salary = response.css('.job_request .salary::text').extract_first()
    # Parse once instead of calling get_salary() twice via __getitem__.
    salary_range = get_salary(salary)
    item_loader.add_value('salary_min', salary_range[0])
    item_loader.add_value('salary_max', salary_range[1])

    item_loader.add_xpath('job_city', '//*[@class="job_request"]/p/span[2]/text()')

    years = response.xpath('//*[@class="job_request"]/p/span[3]/text()').extract_first()
    # Same hoisting for the work-years range.
    years_range = get_year(years)
    item_loader.add_value('work_years_min', years_range[0])
    item_loader.add_value('work_years_max', years_range[1])

    item_loader.add_xpath('degree_need', '//*[@class="job_request"]/p/span[4]/text()')
    item_loader.add_xpath('job_time', '//*[@class="job_request"]/p/span[5]/text()')
    item_loader.add_css('publish_time', '.publish_time::text')
    item_loader.add_xpath('tags', '//*[@class="position-label clearfix"]/li/text()')
    item_loader.add_xpath('job_advantage', '//*[@class="job-advantage"]/p/text()')
    item_loader.add_css('job_desc', '.job_bt div')
    item_loader.add_css('job_addr', '.work_addr')
    item_loader.add_css('company', '#job_company dt a img::attr(alt)')
    item_loader.add_css('company_url', '#job_company dt a::attr(href)')
    item_loader.add_value('crawl_time', datetime.now())

    jobItem = item_loader.load_item()
    return jobItem
def parse_detail(self, response):
    """Load a JobBole article into a JobBoleArticleItem via ItemLoader and yield it.

    :param response: scrapy Response for an article-detail page
    """
    # NOTE: the original assigned `article_item = JobBoleArticleItem()` here and
    # immediately overwrote it below; that dead assignment (and a trailing
    # `pass` after the yield) has been removed.
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_xpath(
        "thumbsUp_nums", "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath(
        "fav_nums", "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath(
        "tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    item_loader.add_xpath("content", "//div[@class='entry']")

    article_item = item_loader.load_item()
    yield article_item
def parse_job(self, response):
    """Parse a Lagou job posting with an ItemLoader and return the loaded item.

    :param response: scrapy Response for a job-detail page
    :return: the populated LagouJobItem
    """
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

    # Field extraction rules, kept as data so the selectors read as a table.
    css_rules = (
        ("title", ".job-name span::text"),
        ("salary", ".salary::text"),
        ("publish_time", ".publish_time::text"),
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div"),
        ("job_addr", ".work_addr"),
        ("tags", ".position-label li::text"),
        ("company_url", "#job_company dt a::attr(href)"),
        ("company_name", "#job_company dt a div h2::text"),
    )
    xpath_rules = (
        ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
        ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
        ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
        ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
    )

    for field, selector in css_rules:
        loader.add_css(field, selector)
    for field, selector in xpath_rules:
        loader.add_xpath(field, selector)

    # Request-level metadata.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("crawl_time", datetime.datetime.now())

    return loader.load_item()
def parse_detail(self, response):
    """Load a JobBole article via ArticleItemLoader and yield the item.

    :param response: scrapy Response for an article-detail page
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    # BUG FIX: the fav/comment/tags selectors were swapped relative to the
    # extraction logic this loader replaced — fav_nums comes from the bookmark
    # button, comment_nums from the #article-comment anchor, and tags from the
    # meta-line links (with "评论" entries filtered out by the item processor).
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Populate an InputNewItem from an article page and yield the loaded item.

    :param response: scrapy Response for an article-detail page
    """
    loader = ImportNewItemLoader(item=InputNewItem(), response=response)

    # URL identity first, then page content.
    loader.add_value('url', response.url)
    loader.add_value('url_id', get_md5(response.url))
    loader.add_xpath('title', '//*[@class="entry-header"]/h1/text()')
    # still pending cleanup in the item processor
    loader.add_xpath(
        'update_time', '//*[@class="entry-meta-hide-on-mobile"]/text()')
    loader.add_xpath(
        'category', '//*[@class="entry-meta-hide-on-mobile"]/a[1]/text()')
    # the tag list starts at the second entry of the meta block
    loader.add_css('tags', '.entry-meta-hide-on-mobile')
    loader.add_css('content', '.entry')

    yield loader.load_item()
def parse_detail(self, response):
    """Extract article fields directly into an ArticleItem and yield it.

    :param response: scrapy Response for an article-detail page
    """
    item = ArticleItem()
    item['url_object_id'] = get_md5(response.url)
    item['front_image_url'] = [response.meta.get('front_image_url', '')]
    item['post_url'] = response.url
    item['description'] = response.meta.get('description', '')  # defaults to empty
    item['title'] = response.xpath(
        '//div[@class="entry-header"]/h1/text()').extract()[0]
    item['date'] = response.xpath(
        '//p[@class="entry-meta-hide-on-mobile"]/text()').extract(
        )[0].strip().replace('·', '').strip()
    item['category'] = response.xpath(
        '//p[@class="entry-meta-hide-on-mobile"]/a[@rel="category tag"]/text()'
    ).extract()[0]

    def _first_int(xpath_expr):
        # Evaluate each selector once (the original ran every xpath twice)
        # and default to 0 when no digits are found.
        nums = response.xpath(xpath_expr).re(r'\d+')
        return int(nums[0]) if nums else 0

    item['fav_num'] = _first_int(
        '//span[contains(@class, "vote-post-up")]/h10/text()')
    item['collections'] = _first_int(
        '//span[@class=" btn-bluet-bigger href-style bookmark-btn register-user-only "]/text()')
    item['comment'] = _first_int(
        '//span[@class="btn-bluet-bigger href-style hide-on-480"]/text()')
    yield item
def parse_detail(self, response):
    """Extract the concrete article fields via AriticleItemLoader and yield the item.

    :param response: scrapy Response for an article-detail page
    """
    # The previous hand-rolled extraction (title/date/tags/vote/comment/
    # bookmark parsing followed by manual item assignment) lived here as a
    # large commented-out block; it was fully superseded by the ItemLoader
    # below and has been deleted.
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = AriticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("zan", ".vote-post-up h10::text")                   # up-votes
    item_loader.add_css("remark", "a[href='#article-comment'] span::text")  # comment count
    item_loader.add_css("collect", ".bookmark-btn::text")                   # bookmark count
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    jobble_item = item_loader.load_item()
    yield jobble_item
def parse_detail(self, response):
    """Load a JobBoleArticleItem from the article page via ItemLoader and yield it.

    :param response: scrapy Response for an article-detail page
    """
    # The superseded manual extraction (commented-out in the original) has
    # been removed; the ItemLoader below is the live implementation.
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_xpath(
        'create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')
    item_loader.add_xpath(
        'praise_nums', "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath(
        'fav_nums', "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath('comments_nums',
                          "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath('content', "//div[@class='grid-8']")
    item_loader.add_xpath(
        'tag', '//p[@class="entry-meta-hide-on-mobile"]/a/text()')
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    # BUG FIX: the image-download pipeline expects a LIST of URLs (the
    # original's own comment said so), but a bare string was passed; wrap it.
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_value('front_image_path', "none")

    item = item_loader.load_item()
    yield item
def parse_detail(self, response):
    """Extract one JobBole article into a JobboleArticleItem and yield it.

    :param response: scrapy Response for an article-detail page
    """
    item = JobboleArticleItem()
    # Cover-image URL passed along in request meta; defaults to "".
    front_image_url = response.meta.get("front_image_url", "")
    title = response.css('div.entry-header h1::text').extract()[0]
    release_date = response.css(
        'p.entry-meta-hide-on-mobile ::text').extract()[0].replace(
        ' ·', '').strip()
    tag = response.css('p.entry-meta-hide-on-mobile a::text').extract()
    tags = ','.join(tag)
    voteup_num = int(
        response.css('span.vote-post-up h10::text').extract()[0])
    collection_num = response.css('span.bookmark-btn::text').extract()[0]
    collection_pattern = re.match(r'.*?(\d+).*', collection_num)
    collection_num = int(collection_pattern.group(1)) if collection_pattern else 0
    comment_num = response.css(
        'a[href="#article-comment"] span::text').extract()[0]
    comment_pattern = re.match(r'.*?(\d+).*', comment_num)
    comment_num = int(comment_pattern.group(1)) if comment_pattern else 0
    content = response.css('div.entry').extract()[0]

    item["url_object_id"] = get_md5(response.url)
    item['front_image_url'] = [front_image_url]
    item['title'] = title
    item['url'] = response.url
    try:
        # BUG FIX: strftime FORMATS a datetime; strptime PARSES a string.
        # The original used strftime on a string, so the try always failed
        # and every article silently got today's date.
        release_date = datetime.datetime.strptime(release_date,
                                                  '%Y/%m/%d').date()
    except Exception:
        release_date = datetime.datetime.now().date()
    item['release_date'] = release_date
    item['tags'] = tags
    item['voteup_num'] = voteup_num
    item['collection_num'] = collection_num
    item['comment_num'] = comment_num
    item['content'] = content
    # The half-built ItemLoader that followed was dead code: load_item() was
    # never called and the manually populated item was yielded; it has been
    # removed.
    yield item