def parse_detail(self, response):
    """Build an article item from a detail page through ``ArticleItemLoader``.

    Every field is registered on the loader either as a CSS rule evaluated
    against ``response`` or as a literal value, after which the loaded item
    is yielded. Any per-field clean-up (taking the first match, pulling
    digits out of the counter texts, ...) is presumably done by the
    loader/item processors, which are not visible in this method.

    :param response: downloaded article detail page; ``response.meta`` may
        carry a ``front_img_url`` entry (``''`` is used when it is absent).
    :return: generator yielding the item returned by ``load_item()``.
    """
    # Cover image URL read from the request meta.
    front_img_url = response.meta.get('front_img_url', '')

    # Load the item through the item loader. A throw-away
    # ``JobboleArticleItem()`` used to be created before this point and was
    # then overwritten by ``load_item()``; it has been dropped.
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css('title', 'div.entry-header > h1::text')
    item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
    item_loader.add_css('praise_num', '.post-adds .vote-post-up h10::text')
    # Author's marker "re": the raw text still needs a regex to get the number.
    item_loader.add_css('fav_num', '.post-adds .bookmark-btn::text')
    # Author's marker "re": same as above.
    item_loader.add_css('comment_num', 'a[href="#article-comment"] span::text')
    # Author's marker: needs a processing function.
    # NOTE(review): the manual extraction this replaced stored the value
    # under 'tags'; confirm that 'tag' is the field declared on the item.
    item_loader.add_css('tag', '.entry-meta .entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', '.entry')
    # Wrapped in a list, as the replaced manual code did for this field.
    item_loader.add_value('front_img_url', [front_img_url])
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))

    yield item_loader.load_item()
def parse_content(self, response):
    """Fill a ``JobboleArticleItem`` via ``ArticleItemLoader`` and yield it.

    Author's note carried over from the original: the custom
    ``ArticleItemLoader`` is used so that loaded values come out as ``str``
    rather than ``list`` (the loader definition is not visible here).

    :param response: downloaded article page; ``response.meta`` may carry
        ``front_image_url`` (``""`` is used when it is absent).
    :return: generator yielding the item returned by ``load_item()``.
    """
    # Article cover image.
    cover_url = response.meta.get("front_image_url", "")
    loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)

    # CSS rules registered before the literal values.
    leading_rules = (
        ("title", ".entry-header h1::text"),
        ("create_date", "p.entry-meta-hide-on-mobile::text"),
    )
    for field_name, css_query in leading_rules:
        loader.add_css(field_name, css_query)

    # Literal values that do not come from a selector.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover_url])

    # Remaining CSS rules, in the original registration order.
    trailing_rules = (
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    )
    for field_name, css_query in trailing_rules:
        loader.add_css(field_name, css_query)

    yield loader.load_item()
def parse_article(self, response):
    """Extract the article fields from a detail page and yield one item.

    Selectors that match nothing no longer raise ``IndexError``: text
    fields fall back to ``""`` and the two parsed counters to ``0``.

    :param response: downloaded article page; ``response.meta`` may carry
        ``front_img_url`` (``None`` when absent).
    :return: generator yielding one populated ``JobboleArticleItem``.
    """

    def first_text(css_query):
        """Return the first match of *css_query*, or "" when nothing matches."""
        return response.css(css_query).extract_first("")

    def first_number(text):
        """Return the first run of digits in *text* as an int, or 0."""
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        found = re.match(r".*?(\d+).*", text)
        return int(found.group(1)) if found else 0

    front_img_url = response.meta.get("front_img_url")
    title = first_text(".entry-header h1::text")
    create_date = first_text("p.entry-meta-hide-on-mobile::text").strip()
    # Kept as the raw text, exactly as before (not converted to int).
    like_num = first_text(".vote-post-up h10::text")
    record_num = first_number(first_text(".bookmark-btn::text"))
    comment_num = first_number(
        first_text("a[href='#article-comment'] span::text"))
    content = first_text("div.entry")
    # NOTE(review): the tag texts are concatenated with no separator, so
    # individual tags cannot be told apart afterwards; confirm whether a
    # delimiter was intended. Behaviour is left unchanged here.
    tags = "".join(response.css("p.entry-meta-hide-on-mobile a::text").extract())

    job_article_instance = JobboleArticleItem()
    job_article_instance["title"] = title
    job_article_instance["create_date"] = create_date
    job_article_instance["url"] = response.url
    job_article_instance["front_img_url"] = front_img_url
    job_article_instance["like_num"] = like_num
    job_article_instance["record_num"] = record_num
    job_article_instance["comment_num"] = comment_num
    job_article_instance["tags"] = tags
    job_article_instance["content"] = content

    yield job_article_instance
def parse_detail(self, response):
    """Load the article fields through ``ArticleItemLoader`` and yield the item.

    Author's notes carried over from the original comments: loading with
    the stock loader leaves every value as a list and the extracted values
    still need processing (regex extraction etc.); this is said to be
    handled in ``items.py`` with ``TakeFirst`` / ``MapCompose`` on the
    fields, which is not visible from this method.

    :param response: downloaded article page; ``response.meta`` may carry
        ``front_image_url`` (``''`` is used when the key is absent).
    :return: generator yielding the item returned by ``load_item()``.
    """
    # Falls back to '' when the key is missing from the request meta.
    cover_url = response.meta.get('front_image_url', '')

    # Custom loader rather than the stock ``ItemLoader``.
    loader = ArticleItemLoader(
        item=JobboleArticleItem(),
        response=response,
    )

    loader.add_css("title", ".entry-header h1::text")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    loader.add_value("front_image_url", [cover_url])
    loader.add_css("praise_nums", ".vote-post-up h10::text")
    loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    loader.add_css("fav_nums", ".bookmark-btn::text")
    loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    loader.add_css("content", "div.entry")

    # Per the author's note, the yielded item is handed on to pipelines.py.
    yield loader.load_item()
def parse_detail(self, response):
    """Parse a downloaded detail page and yield the extracted article item.

    Values are registered on a ``JobboleArticleItemLoader``: literal values
    with ``add_value`` and CSS rules with ``add_css``. ``load_item()`` is
    called once all rules are added.

    :param response: downloaded detail page; ``response.meta`` may carry
        ``front_image_url`` (``''`` is used when it is absent).
    :return: generator yielding the item returned by ``load_item()``.
    """
    # Create the item loader object.
    loader = JobboleArticleItemLoader(
        item=JobboleArticleItem(), response=response)

    # Values added directly.
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    # NOTE(review): the earlier hand-written version wrapped this URL in a
    # list because the image pipeline iterates over it; here the bare value
    # is passed - confirm the loader/field processors yield an iterable.
    loader.add_value('front_image_url',
                     response.meta.get('front_image_url', ''))

    # CSS extraction rules, kept in their original registration order.
    css_rules = (
        ('title', '.entry-header h1::text'),
        ('create_data', '.entry-meta-hide-on-mobile::text'),
        ('tags', '.entry-meta-hide-on-mobile a::text'),
        ('thumbs_up', '.vote-post-up h10::text'),
        ('collected', '.bookmark-btn::text'),
        ('comments', '.post-adds a span::text'),
        ('content', '.entry'),
    )
    for field_name, css_query in css_rules:
        loader.add_css(field_name, css_query)

    # All rules are in place; build the item and hand it on.
    yield loader.load_item()
def parse_detail(self, response):
    """Extract the article fields with CSS selectors and yield one item.

    Fixes over the previous version:

    * the praise count was computed as ``int(praise_nums[0])`` on the
      string returned by ``extract_first()``, i.e. from its first character
      only ("15" became 1); the whole number is parsed now;
    * selectors that match nothing no longer raise ``IndexError`` - text
      fields fall back to ``''`` and counters to ``0``.

    :param response: downloaded article page; ``response.meta`` may carry
        ``front_image_url`` (``''`` is used when the key is absent).
    :return: generator yielding one populated ``JobboleArticleItem``.
    """

    def first_text(css_query):
        """Return the first match of *css_query*, or '' when nothing matches."""
        return response.css(css_query).extract_first('')

    def first_number(text):
        """Return the first run of digits in *text* as an int, or 0."""
        found = re.match(r'.*?(\d+).*', text)
        return int(found.group(1)) if found else 0

    # Article cover image.
    front_image_url = response.meta.get('front_image_url', '')

    title = first_text('.entry-header h1::text')
    create_date = first_text(
        'p.entry-meta-hide-on-mobile::text').strip().replace('·', '').strip()
    praise_nums = first_number(first_text('.vote-post-up h10::text'))
    fav_nums = first_number(first_text('.bookmark-btn::text'))
    comment_nums = first_number(
        first_text("a[href='#article-comment'] span::text"))
    content = first_text("div.entry")

    # Drop the link whose text ends with '评论'; keep the rest as tags.
    tag_list = [
        element
        for element in response.css(
            "p.entry-meta-hide-on-mobile a::text").extract()
        if not element.strip().endswith('评论')
    ]
    tags = ','.join(tag_list)

    # Fill the item.
    article_item = JobboleArticleItem()
    article_item['title'] = title
    article_item['url'] = response.url
    article_item['url_object_id'] = get_md5(response.url)  # MD5 of the URL
    article_item['create_date'] = create_date
    # Author's note: the images handling expects a list here.
    article_item['front_image_url'] = [front_image_url]
    article_item['praise_nums'] = praise_nums
    article_item['fav_nums'] = fav_nums
    article_item['comment_nums'] = comment_nums
    article_item['tags'] = tags
    article_item['content'] = content

    # Per the author's note, the yielded item is handed on to pipelines.py.
    yield article_item
def parse_detail(self, response):
    """Extract the article fields through ``ArticleItemLoader`` (XPath rules).

    Literal values and XPath rules are registered on the loader and the
    loaded item is yielded. Any per-field post-processing is presumably
    done by the loader/item processors, which are not visible here.

    :param response: downloaded article page; ``response.meta`` may carry
        ``front_img`` (``""`` is used when it is absent).
    :return: generator yielding the item returned by ``load_item()``.
    """
    # Cover image URL passed along in the request meta.
    image = response.meta.get("front_img", "")

    # Load the item through the item loader. A ``JobboleArticleItem()`` that
    # used to be created up front was overwritten by ``load_item()`` without
    # ever being used, so it has been dropped.
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_value('url', response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_value("front_img_url", [image])
    item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    item_loader.add_xpath(
        "create_time", "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_xpath(
        "praise_num", "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath(
        "fav_num", "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath(
        "comment_num", "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath("content", "//div[@class='entry']//text()")
    item_loader.add_xpath(
        "tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")

    yield item_loader.load_item()