def parse_question(self, response):
    # Load the question item through an ItemLoader
    question_loader = ArticleItemLoader(item=V2exQuItem(), response=response)
    question_loader.add_xpath("title", "//div[@class='header']/h1/text()")

    # Prefer the markdown body; fall back to the plain topic content
    mar_content = response.xpath("//div[@class='markdown_body']").extract()
    if len(mar_content) == 0:
        content = "".join(response.xpath("//div[@class='topic_content']").extract()).replace("\n", "")
    else:
        content = "".join(mar_content).replace("\n", "")
    match_re1 = re.match(self.content_rule, content)
    if match_re1:
        question_loader.add_value("content", match_re1.group(1))

    # Comment count; default to 0 when the page shows none
    comment_count = response.xpath("//div[@class='cell']/span[@class='gray']/text()").extract()
    if len(comment_count) == 0:
        question_loader.add_value("comment_count", 0)
    else:
        match_re2 = re.match(self.comment_rule, comment_count[0])
        if match_re2:
            question_loader.add_value("comment_count", match_re2.group(1))

    question_loader.add_value("user_id", random.randint(2, 14))  # assign a random existing user id
    question_loader.add_value("created_date", time.time())

    question_item = question_loader.load_item()
    yield question_item
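# All of the parse functions in this file lean on two project-level helpers that are
# not shown here: a custom ArticleItemLoader and a get_md5() url hasher. A minimal
# sketch of typical definitions (assumed, not the original project's code; on
# Scrapy >= 2.3 the processors live in itemloaders.processors instead):
import hashlib

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class ArticleItemLoader(ItemLoader):
    # Return a single value instead of a list for every field by default.
    default_output_processor = TakeFirst()


def get_md5(url):
    # Hash a URL into a stable fixed-length id, usable as a primary key.
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()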
def parse_detail(self, response):
    # Load the item through an ItemLoader
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_xpath('create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')

    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader.add_value('front_image_url', [front_image_url])

    item_loader.add_xpath('comment_nums', '//a[@href="#article-comment"]/span/text()')
    fav_nums = response.xpath('//div[@class="post-adds"]/span[2]/h10/text()').extract_first()
    if fav_nums is None:
        fav_nums = '0'
    item_loader.add_value('fav_nums', fav_nums)
    item_loader.add_xpath('tags', '//div[@class="entry-meta"]/p/a/text()')
    item_loader.add_xpath('content', '//div[@class="entry"]')

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    # Title
    title = response.xpath('//div[@class="entry-header"]/h1/text()').extract()

    # Publish date
    crttime_content = response.xpath('//div[@class="entry-meta"]/p/text()').extract()
    if len(crttime_content) == 0:
        create_time = 'no'
    else:
        create_time = crttime_content[0].replace('·', '').strip()

    # Article category
    article_kind_content = response.xpath('//div[@class="entry-meta"]/p/a/text()').extract()
    if len(article_kind_content) == 0:
        article_kind = 0
    else:
        article_kind = article_kind_content[0]

    # Praise count
    praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0]

    # Favourite (bookmark) count
    fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0]
    match_re = re.match(".*?(\d+).*", fav_nums)
    if match_re:
        fav_nums = match_re.group(1)
    else:
        fav_nums = 0

    # Comment count
    commant_nums = response.xpath("//a[@href='#article-comment']/span/text()").extract()[0]
    match_re = re.match(".*?(\d+).*", commant_nums)
    if match_re:
        commant_nums = match_re.group(1)
    else:
        commant_nums = 0

    # Body
    # content = response.xpath("//div[@class='entry']").extract()

    # Author name
    author_name_content = response.xpath("//div[@id='author-bio']//a/text()").extract()
    if len(author_name_content) == 0:
        author_name = 'no'
    else:
        author_name = author_name_content[0]

    item_loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
    item_loader.add_value('url', response.url)
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value('create_time', [create_time])
    item_loader.add_value('article_kind', [article_kind])
    item_loader.add_value('praise_nums', [praise_nums])
    item_loader.add_value('fav_nums', [fav_nums])
    item_loader.add_value('commant_nums', [commant_nums])
    # item_loader.add_value('content', [content])
    item_loader.add_value('author_name', [author_name])

    article_item = item_loader.load_item()
    yield article_item
def parse_content(self, response):
    item_loader = ArticleItemLoader(item=DouBanItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_xpath("title", "//div[@id='content']/h1/span[1]/text()")
    # item_loader.add_xpath("time", "//div[@id='content']/h1/span[2]/text()")
    item_loader.add_xpath("director", "//div[@id='info']/span[1]/span[2]/a/text()")
    # item_loader.add_xpath("area", "//*[@id='info']/text()[8]")
    # item_loader.add_xpath("language", "//*[@id='info']/text()[10]")
    item_loader.add_css("score", "div.rating_self strong::text")
    item_loader.add_xpath("introduction", "//span[@property='v:summary']/text()")
    item_loader.add_xpath("front_image_url", "//*[@id='mainpic']/a/img/@src")

    infos = response.xpath("//*[@id='info']/text()").extract()
    info_list = []
    for info in infos:
        match_re = re.match(self.info_rule, info.strip())
        if match_re:
            info_list.append(match_re.group(1))

    time = response.xpath("//div[@id='content']/h1/span[2]/text()").extract()[0]
    match_re = re.match(self.time_rule, time)
    if match_re:
        item_loader.add_value("time", match_re.group(1))

    item_loader.add_value("area", info_list[0])
    item_loader.add_value("language", info_list[1])
    item_loader.add_value("nickname", info_list[2])

    douban_item = item_loader.load_item()
    yield douban_item
def parse_job(self, response):
    # Parse a Lagou job posting through an ItemLoader
    item_loader = ArticleItemLoader(item=LaGouItem(), response=response)
    item_loader.add_css("job_name", '.job-name::attr(title)')
    item_loader.add_css("salary", ".salary::text")
    item_loader.add_xpath("job_exp", "//dd[@class='job_request']/p/span[3]/text()")
    item_loader.add_xpath("edu", "//dd[@class='job_request']/p/span[4]/text()")
    item_loader.add_xpath("job_type", "//dd[@class='job_request']/p/span[5]/text()")
    item_loader.add_xpath("work_city", "//dd[@class='job_request']/p/span[2]/text()")
    item_loader.add_css("company_name", "#job_company .b2::attr(alt)")
    item_loader.add_css("company_url", ".job_company dt a::attr(href)")
    item_loader.add_css("work_addr", ".work_addr")
    # item_loader.add_xpath("feedback", "//div[@class='publisher_data']/div[2]/span[@class='tip']/i/text()")
    item_loader.add_css("create_date", ".publish_time::text")
    item_loader.add_value("job_url", response.url)
    item_loader.add_value("job_url_id", get_md5(response.url))
    item_loader.add_css("job_advantage", ".job-advantage p::text")
    item_loader.add_css("job_desc", ".job_bt div")
    item_loader.add_css("tag", ".position-label li")

    job_item = item_loader.load_item()
    return job_item
def parse_job(self, response):
    # Parse a Lagou job posting
    item_loader = ArticleItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_css("title", ".job-name::attr(title)")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("salary", ".job_request .salary::text")
    item_loader.add_xpath("job_city", "//*[@class='job_request']//span[2]/text()")
    item_loader.add_xpath("work_years", "//*[@class='job_request']//span[3]/text()")
    item_loader.add_xpath("degree_need", "//*[@class='job_request']//span[4]/text()")
    item_loader.add_xpath("job_type", "//*[@class='job_request']//span[5]/text()")
    item_loader.add_css("tags", '.position-label li::text')
    item_loader.add_css("publish_time", ".publish_time::text")
    item_loader.add_css("job_advantage", ".job-advantage p::text")
    item_loader.add_css("job_desc", ".job_bt div")
    item_loader.add_css("job_addr", ".work_addr")
    item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
    item_loader.add_css("company_url", "#job_company dt a::attr(href)")
    item_loader.add_value("crawl_time", datetime.now())

    job_item = item_loader.load_item()
    return job_item
def parse_detail(self, response):
    front_end_url = response.meta["front_end_url"]  # article cover image

    # Load the item through an ItemLoader
    item_loader = ArticleItemLoader(item=JianShuArticlespiderItem(), response=response)
    item_loader.add_xpath("title", "//div[@class='post']/div[@class='article']/h1[@class='title']/text()")
    item_loader.add_value("url", response.url)
    item_loader.add_value("front_image_url", [front_end_url])

    jianshu_item = item_loader.load_item()
    yield jianshu_item
def parse_detail(self, response):
    image_url = response.meta.get('meta_1')

    # Load the item through an ItemLoader
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath('title', '//h1/text()')
    item_loader.add_value('image_url', [image_url])
    item_loader.add_value('url_object_id', get_md5(image_url))
    item_loader.add_value('image_path', '')  # filled in later by the image pipeline

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    match_re = re.match(".*?(\d+)", response.url)
    if match_re:
        post_id = match_re.group(1)

        # Use ItemLoader
        item_loader = ArticleItemLoader(item=CnblogsArticleItem(), response=response)
        item_loader.add_xpath('title', "//*[@id='news_title']//a/text()")
        item_loader.add_xpath('content', "//*[@id='news_content']")
        item_loader.add_xpath('tags', "//*[@class='news_tags']//a/text()")
        item_loader.add_xpath('create_date', "//*[@id='news_info']//*[@class='time']/text()")
        item_loader.add_value('url', response.url)
        item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))

        # Fetching the counts with a synchronous requests.get() here would block the
        # rest of the code, so issue an asynchronous Request instead.
        # Note: urljoin builds an absolute path appended directly to the domain.
        yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                      meta={"item_loader": item_loader, "url_object_id": get_md5(response.url)},
                      callback=self.parse_nums)
def parse_detail(self, response):
    match_re = re.match(".*?(\d+)", response.url)
    if match_re:
        post_id = match_re.group(1)

        item_loader = ArticleItemLoader(item=CdnBlogArtcleItem(), response=response)
        item_loader.add_xpath("title", '//*[@id="news_title"]//a/text()')
        item_loader.add_xpath("create_date", '//div[@id="news_info"]//span[@class="time"]/text()')
        item_loader.add_xpath("content", '//div[@id="news_content"]')
        item_loader.add_xpath("tags", '//div[@class="news_tags"]//a/text()')
        item_loader.add_value("url", response.url)
        if response.meta.get("front_image_url", []):
            item_loader.add_value("front_image_url", response.meta.get("front_image_url", []))

        # A synchronous requests.get() against
        # https://news.cnblogs.com/NewsAjax/GetAjaxNewsInfo?contentId=<post_id>
        # (which returns JSON like {"ContentID":654012,"CommentCount":0,"TotalView":31,"DiggCount":0,"BuryCount":0})
        # would block the crawl, so schedule an asynchronous Request instead.
        yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                      meta={"article_item": item_loader, "url": response.url},
                      callback=self.parse_nums)
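# Both cnblogs parsers above defer the view/comment counts to a parse_nums callback
# that is not included here. A sketch of what such a callback could look like,
# assuming the meta keys used above and the fields of the sample JSON payload
# (CommentCount / TotalView / DiggCount); the item field names are guesses:
import json


def parse_nums(self, response):
    j_data = json.loads(response.text)
    # The first spider passes the loader as "item_loader", the second as "article_item".
    item_loader = response.meta.get("item_loader") or response.meta.get("article_item")

    item_loader.add_value("url_object_id", response.meta.get("url_object_id", ""))
    item_loader.add_value("comment_nums", j_data.get("CommentCount", 0))
    item_loader.add_value("view_nums", j_data.get("TotalView", 0))
    item_loader.add_value("praise_nums", j_data.get("DiggCount", 0))

    article_item = item_loader.load_item()
    yield article_item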
def parse_list(self, response):
    # Cover image of the book
    post_url = response.xpath('//*[@id="fmimg"]/img/@src').extract_first("")
    front_image_url = parse.urljoin(response.url, post_url)
    # Chapter list
    chapter_urls = response.xpath('//*[@id="list"]/dl/dd/a/@href').extract()

    # Load the item through the custom ArticleItemLoader (xpath based)
    item_loader = ArticleItemLoader(item=BiQuGeListItem(), response=response)
    item_loader.add_value("url_object_id", get_md5(response.url))               # md5 id
    item_loader.add_xpath("title", '//*[@id="info"]/h1/text()')                 # book title
    item_loader.add_xpath("author", '//*[@id="info"]/p[1]/text()')              # author
    item_loader.add_xpath("last_update_time", '//*[@id="info"]/p[3]/text()')    # last update time
    item_loader.add_value("front_image_url", [front_image_url])                 # image download url

    article_item = item_loader.load_item()
    yield article_item

    # Follow every chapter detail page
    if chapter_urls:
        for each in chapter_urls:
            yield Request(url=parse.urljoin(response.url, each),
                          meta={"url_object_id": article_item['url_object_id']},
                          callback=self.parse_details)
def parse_detail(self, response):
    # Load the item through an ItemLoader
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    item_loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
    # item_loader.add_xpath("vote_post_up", "//span[contains(@class, 'vote-post-up')]/h10/text()")
    # item_loader.add_xpath("bookmark_nums", "//span[contains(@class, 'bookmark-btn')]/text()")
    item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
    item_loader.add_css("content", "div.entry")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_value("front_image_url", response.meta.get("front_image_url", ""))

    article_item = item_loader.load_item()
    yield article_item
def detail_parse(self, response):
    # Load the item through an ItemLoader
    item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
    item_loader.add_xpath('title', '//div[@class="main_left"]//h2/text()')       # news title
    item_loader.add_xpath('content', '//div[@class="wen_article"]')              # article body
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_xpath('create_time', '//div[@class="meta"]/span/text()')     # publish time

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    # Load the item through an ItemLoader
    front_image_url = response.meta.get("front_image_url", "")  # article cover image

    item_loader = ArticleItemLoader(item=JobBleArticleItem(), response=response)
    item_loader.add_xpath("title", '/html/body/div[1]/div[3]/div[1]/div[1]/h1/text()')
    item_loader.add_value("url", response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_xpath("create_date", '/html/body/div[1]/div[3]/div[1]/div[2]/p/text()')
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_xpath('praise_nums', '//span[contains(@class, "vote-post-up")]/h10/text()')
    item_loader.add_css('comment_nums', ".btn-bluet-bigger.href-style.hide-on-480::text")
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_xpath('tags', '//p[@class="entry-meta-hide-on-mobile"]/a/text()')
    item_loader.add_xpath('content', "//div[@class='entry']")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    front_image_url = response.meta.get('front_image_url', '')  # URL of the article cover image

    # Load the item through an ItemLoader (instantiate a loader for this specific item)
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    item_loader.add_xpath('create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')
    item_loader.add_value('url', response.url)  # pass the value straight through
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_xpath('comment_nums', "//a[@href='#article-comment']/text()")
    item_loader.add_xpath('fav_nums', "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath('tags', "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    item_loader.add_xpath('content', "//div[@class='entry']/p/text()")

    article_item = item_loader.load_item()  # parse the collected values into the item
    yield article_item
def parse_page(self, response):
    """Parse the article detail page: title, create date, praise/favourite/comment counts and tags."""
    # Load the item through an ItemLoader
    front_image_url = parse.urljoin(response.url, response.meta.get("front_image_url", ""))

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath("title", "/html/head/title/text()")
    item_loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_id_md5", get_md5(response.url))
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_xpath("praise_nums", "//span[contains(@class, 'vote-post-up')]/h10/text()")
    item_loader.add_xpath("fav_nums", "//span[@class=' btn-bluet-bigger href-style bookmark-btn register-user-only ']/text()")
    item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span")
    item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    item_loader.add_xpath("content", "//div[@class='entry']")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    article_item = JobBoleArticleItem()
    # Extract the article fields directly: title, create date, praise / favourite /
    # comment counts and body (e.g. url = 'http://blog.jobbole.com/110287/')
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    title = response.xpath('//*[@class="entry-header"]/h1/text()').extract_first()
    create_data = response.xpath("//*[@class='entry-meta']/p/text()").extract()[0].split()[0]
    praise_nums = response.xpath("//span[contains(@class,'vote-post-up')]/h10/text()").extract()[0]
    fav_nums = response.xpath("//span[contains(@class,'bookmark-btn')]/text()").extract_first().split()[0].split(' ')[0]
    fav_re = re.match(".*?(\d+).*", fav_nums)
    # if fav_re:
    #     fav_nums = int(fav_re.group(1))
    # else:
    #     fav_nums = 0
    comment_nums = response.xpath("//a[@href='#article-comment']/span/text()").extract()[0]
    comment_re = re.match(".*?(\d+).*", comment_nums)
    # if comment_re:
    #     comment_nums = int(comment_re.group(1))
    # else:
    #     comment_nums = 0
    content = response.xpath("//div[@class='entry']").extract()[0]
    type_tag = response.xpath("//*[@class='entry-meta']/p/a/text()").extract()[0].split()[0]
    tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
    tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
    tags = ",".join(tag_list)
    author_type = response.xpath("//div[@class='copyright-area']/text()").extract()[0]  # either "本文作者: " or "原文出处: "
    # if author_type == '本文作者: ':
    #     author = response.xpath("//div[@class='copyright-area']/a/text()").extract()[1]
    # elif author_type == '原文出处: ':
    #     author = response.xpath("//div[@class='copyright-area']/a/text()").extract()[0]
    # tags = type_tag + ',' + author

    article_item["url_object_id"] = get_md5(response.url)
    article_item['title'] = title
    article_item['url'] = response.url
    try:
        create_data = datetime.datetime.strptime(create_data, "%Y/%m/%d").date()
    except Exception as e:
        create_data = datetime.datetime.now().date()
    article_item['create_date'] = create_data
    article_item['front_image_url'] = [front_image_url]
    article_item['praise_nums'] = praise_nums
    article_item['comment_nums'] = comment_nums
    article_item['fav_nums'] = fav_nums
    article_item['tags'] = tags
    article_item['content'] = content

    # An ItemLoader makes the css/xpath rules much easier to maintain; it acts as a
    # container for the raw values and leaves cleanup to the field processors.
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # item_loader.add_css("title", " ")
    item_loader.add_xpath("title", "//*[@class='entry-header']/h1/text()")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("create_date", "//*[@class='entry-meta']/p/text()")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_xpath("praise_nums", "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath("fav_nums", "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath("tags", "//*[@class='entry-meta']/p/a/text()")
    item_loader.add_xpath("content", "//div[@class='entry']")

    # The loaded values are lists; the item's processors still have to filter out redundant data.
    article_item = item_loader.load_item()

    yield article_item  # emit article_item
def parse_detail(self, response):
    # Load the item through an ItemLoader
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    page_url = response.meta.get("page_url", "")  # URL of the listing page the article came from

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("create_date", '//p[@class="entry-meta-hide-on-mobile"]/text()')
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_value("page_url", page_url)
    item_loader.add_xpath("comment_nums", '//a[@href="#article-comment"]/span/text()')
    item_loader.add_xpath("praise_nums", '//span[@class=" btn-bluet-bigger href-style vote-post-up register-user-only "]/h10/text()')
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    item_loader.add_xpath("content", '//div[@class="entry"]/text()')

    article_item = item_loader.load_item()
    yield article_item
def parse_zl(self, response):
    # Parse a Zhilian job posting through an ItemLoader
    item_loader = ArticleItemLoader(item=ZhiLianItem(), response=response)
    item_loader.add_css("job_name", '.fixed-inner-box h1::text')
    item_loader.add_xpath("salary", "//div[@class='terminalpage-left']/ul/li[1]/strong/text()")
    item_loader.add_xpath("job_exp", "//div[@class='terminalpage-left']/ul/li[5]/strong/text()")
    item_loader.add_xpath("edu", "//div[@class='terminalpage-left']/ul/li[6]/strong/text()")
    item_loader.add_xpath("job_type", "//div[@class='terminalpage-left']/ul/li[4]/strong/text()")
    item_loader.add_xpath("work_city", "//div[@class='terminalpage-left']/ul/li[2]/strong/a/text()")
    item_loader.add_css("company_name", ".inner-left a ::text")
    item_loader.add_css("company_url", ".inner-left a::attr(href)")
    item_loader.add_css("work_addr", ".terminalpage-main h2::text")
    # item_loader.add_xpath("feedback", "//div[@class='publisher_data']/div[2]/span[@class='tip']/i/text()")
    item_loader.add_xpath("create_date", "//div[@class='terminalpage-left']/ul/li[3]/strong")
    item_loader.add_value("job_url", response.url)
    item_loader.add_value("job_url_id", get_md5(response.url))
    item_loader.add_css("job_advantage", ".welfare-tab-box ::text")
    item_loader.add_xpath("job_desc", "//div[@class='tab-inner-cont'][1]/p")
    item_loader.add_xpath("tag", "//div[@class='terminalpage-left']/ul/li[8]/strong/a/text()")

    job_item = item_loader.load_item()
    return job_item
def parse_detail(self, response):
    """Extract the article fields."""
    # Load the item through the custom item_loader
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')

    front_image_url = response.meta.get("front_image_url", "")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_xpath("praise_nums", "//span[contains(@class, 'vote-post-up')]/h10/text()")
    item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath("fav_nums", "//span[contains(@class, 'bookmark-btn')]/text()")
    item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    item_loader.add_xpath("content", "//div[@class='entry']")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    # Load the item through an ItemLoader
    front_image_url = response.meta.get('front_image_url', '')  # article cover image

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath('title', '//*[@class="entry-header"]/h1/text()')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_xpath('create_date', '//*[@class="entry-meta"]/p/text()[1]')
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_xpath('praise_nums', '//h10/text()')
    item_loader.add_xpath('collect_nums', '//*[@class="post-adds"]/span[2]/text()')
    item_loader.add_xpath('comment_nums', '//*[@class="post-adds"]/a[1]/span/text()')
    item_loader.add_xpath('content', '//*[@class="entry"]')
    item_loader.add_xpath('tags', '//div[@class="entry-meta"]/p/a/text()')

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    # Load the item through an ItemLoader (easier to maintain than scattered selectors):
    # 1. every field is collected as a list by default
    # 2. processors attached to the item fields clean the raw values
    front_image_url = response.meta.get("front_image_url", "")  # article cover image

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    item_loader.add_xpath("content", "//div[@class='entry']")

    article_item = item_loader.load_item()
    yield article_item
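# Most of these loaders rely on input/output processors declared on the item fields
# to turn the raw selector lists into clean values. A hedged sketch of such an item
# definition (field names mirror the jobbole snippets; the regex and date helpers
# are illustrative, not the original project's code):
import datetime
import re

import scrapy
from scrapy.loader.processors import Identity, Join, MapCompose


def extract_num(value):
    # Pull the first integer out of strings like " 2 收藏" or "1 评论".
    match_re = re.match(r".*?(\d+).*", value)
    return int(match_re.group(1)) if match_re else 0


def parse_date(value):
    # "2017/03/18 ·  职场" -> date; fall back to today when parsing fails.
    try:
        return datetime.datetime.strptime(value.strip().replace("·", "").strip(), "%Y/%m/%d").date()
    except Exception:
        return datetime.datetime.now().date()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    create_date = scrapy.Field(input_processor=MapCompose(parse_date))
    front_image_url = scrapy.Field(output_processor=Identity())  # keep the list for the images pipeline
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(input_processor=MapCompose(extract_num))
    comment_nums = scrapy.Field(input_processor=MapCompose(extract_num))
    fav_nums = scrapy.Field(input_processor=MapCompose(extract_num))
    tags = scrapy.Field(output_processor=Join(","))
    content = scrapy.Field()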
def parse_detail(self, response):
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
    item_loader.add_xpath("create_date", '//div/p[@class="entry-meta-hide-on-mobile"]/text()')
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_value("front_image_url", [front_image_url])
    # item_loader.add_xpath("front_image_path",)
    item_loader.add_xpath("praise_nums", '//div/span[contains(@class,"vote-post-up")]/h10/text()')
    item_loader.add_xpath("fav_nums", '//div/span[contains(@class,"bookmark-btn")]/text()')
    item_loader.add_xpath("comment_nums", '//div/a[@href="#article-comment"]/span/text()')
    item_loader.add_xpath("content", '//div[@class="entry"]')
    item_loader.add_xpath("tags", '//div/p[@class="entry-meta-hide-on-mobile"]/a/text()')

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    if response.url == 'http://blog.jobbole.com/all-posts/':
        pass

    # Load the item through an ItemLoader
    item_loader = ArticleItemLoader(item=JobbboleItem(), response=response)
    item_loader.add_xpath("title", '//*[@class="entry-header"]/h1/text()')
    item_loader.add_value("url", response.url)
    item_loader.add_xpath("create_date", '//*[@class="entry-meta"]/p/text()')
    item_loader.add_xpath("praise_nums", "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath("book_mark", "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath("content", "//*[@class='entry']/p/text()")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    # Extract the article fields and load the item through an item_loader
    front_image_url = response.meta.get('front_image_url', '')

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_xpath('create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_xpath('praise_nums', '//span[contains(@class, "vote-post-up")]/h10/text()')
    item_loader.add_xpath('comment_nums', '//a[@href="#article-comment"]/span/text()')
    item_loader.add_xpath('fav_nums', '//span[contains(@class, "bookmark-btn")]/text()')
    item_loader.add_xpath('tags', '//p[@class="entry-meta-hide-on-mobile"]/a/text()')
    item_loader.add_xpath('content', '//div[@class="entry"]')

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract the article data."""
    # Load the item through an ItemLoader
    front_image_url = response.meta.get("font_image_url", "")

    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_xpath("praise_nums", "//span[contains(@class, 'vote-post-up')]/h10/text()")
    item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath("fav_nums", "//span[contains(@class, 'bookmark-btn')]/text()")
    item_loader.add_xpath("content", "//div[@class='entry']")
    item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    # Load the item through an ItemLoader
    front_image_url = response.meta.get("front_image_url", "")  # article cover image

    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_xpath('title', "//div[@class='entry-header']/h1/text()")
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_xpath('create_date', "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_xpath('praise_nums', "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath('comment_nums', "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath('fav_nums', "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath('tags', "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    # item_loader.add_xpath('content', "//div[@class='entry']")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    image_url = response.meta.get("image_url", "")  # article cover image

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath("title", "//h1/text()")
    item_loader.add_xpath("publish_time", "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_value("image_url", [image_url])
    item_loader.add_xpath("zan_nums", "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath("collect_nums", "//span[@data-site-id='2']/text()")
    item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath("content", "//div[@class='entry']")
    item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")

    article = item_loader.load_item()
    yield article
def parse_detail(self, response):
    # Load the item through an ItemLoader
    front_image_url = response.meta.get("front-img-url", "")  # URL of the article cover image

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    item_loader.add_xpath("creat_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_xpath("praise_num", "//span[contains(@class, 'vote-post-up')]/h10/text()")
    item_loader.add_xpath("collect_num", "//span[contains(@class, 'bookmark-btn')]/text()")
    item_loader.add_css("comment_num", ".btn-bluet-bigger.href-style.hide-on-480::text")
    item_loader.add_xpath("content", "//div[@class='entry']")
    item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_value("front_image_url", [front_image_url])

    article_item = item_loader.load_item()
    yield article_item  # hand the item off to the pipeline
def parse_detail(self, response):
    # Request callback: extract the article fields and load them through an ItemLoader
    front_image_url = response.meta.get("front_image_url", "")  # cover image; .get() avoids a KeyError

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath("title", "//*[@class='entry-header']/h1/text()")
    item_loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_value("url", response.url)
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("praise_nums", "//span[contains(@class, 'vote-post-up')]/h10/text()")
    item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath("fav_nums", "//span[contains(@class, 'bookmark-btn')]/text()")
    item_loader.add_xpath("content", "//div[@class='entry']")
    item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")

    article_item = item_loader.load_item()
    yield article_item  # pass the item on to pipelines.py
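# The front_image_url lists collected above are intended for Scrapy's built-in
# images pipeline. A minimal, assumed settings.py wiring (paths are illustrative;
# the real project's pipeline configuration may differ):
import os

project_dir = os.path.dirname(os.path.abspath(__file__))

ITEM_PIPELINES = {
    "scrapy.pipelines.images.ImagesPipeline": 1,
}
IMAGES_URLS_FIELD = "front_image_url"                 # item field holding the list of image URLs
IMAGES_STORE = os.path.join(project_dir, "images")    # where downloaded cover images are saved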