def parse_detail(self, response):
    """Extract article fields with CSS selectors and yield the loaded item.

    BUG FIX: removed the `article_item = JobboleArticleItem()` created at the
    top of the original — it was dead, immediately replaced by `load_item()`.
    """
    # Article cover image, forwarded by the listing page via request meta.
    front_image_url = response.meta.get('front_image_url', "")

    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css("title", '.entry-header>h1::text')
    item_loader.add_value("url", response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    item_loader.add_value('front_image_url', [front_image_url])
    # Articles with no votes have no <h10> node, so fall back to "0".
    if response.css('.vote-post-up>h10::text'):
        item_loader.add_css("praise_number", '.vote-post-up>h10::text')
    else:
        item_loader.add_value("praise_number", "0")
    item_loader.add_css("comment_nums", 'a[href="#article-comment"]>span::text')
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile>a::text')
    item_loader.add_css('content', 'div.entry')

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract article fields via CSS selectors, loaded through an ItemLoader.

    BUG FIX / cleanup: deleted a large block of commented-out manual
    extraction code and the dead `article_item = JobboleArticleItem()`
    assignment that `load_item()` immediately replaced.
    """
    # Cover image URL forwarded through request meta.
    # NOTE(review): the meta key "font_img_url" looks like a typo for
    # "front_img_url" — confirm it matches the key set by the listing parser.
    front_img_url = response.meta.get("font_img_url", "")

    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_css('create_date', '.entry-meta p::text')
    item_loader.add_css('praise_nums', '.post-adds span h10::text')
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('comment_nums', 'a[href="#article-comment"] span::text')
    item_loader.add_css('tags', '.entry-meta p a::text')
    item_loader.add_css('content', '.entry')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_value('front_img_url', [front_img_url])

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Scrape one article page (XPath selectors) and yield the loaded item.

    Cleanup: removed the commented-out manual-extraction block — dead code
    kept from an earlier iteration of this spider.
    """
    # NOTE(review): meta key "font_image_url" looks like a typo for
    # "front_image_url" — confirm against the key the listing parser sets.
    front_image_url = response.meta.get("font_image_url", "")

    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("create_date",
                          "//p[@class='entry-meta-hide-on-mobile']/text()")
    # Image pipelines expect a list of cover URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_xpath("praise_nums",
                          "//span[contains(@class, 'vote-post-up')]/h10/text()")
    item_loader.add_xpath("comment_nums",
                          "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath("fav_nums",
                          "//span[contains(@class, 'bookmark-btn')]/text()")
    item_loader.add_xpath("content", "//div[@class='entry']")
    item_loader.add_xpath("tags",
                          "//p[@class='entry-meta-hide-on-mobile']/a/text()")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract article fields from a detail page with XPath selectors.

    BUG FIX: the content selector called `response.xaaapath(...)`, which
    raises AttributeError at runtime — corrected to `response.xpath(...)`.
    """
    article_title = response.xpath(
        "//div[@class='entry-header']/h1/text()").extract()[0]
    # "·" is a decorative separator next to the date text; strip it away.
    publish_time = response.xpath(
        "//p[@class='entry-meta-hide-on-mobile']/text()").extract(
        )[0].strip().replace("·", "").strip()
    tag_list = response.xpath(
        "//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
    article_content = response.xpath(
        "//div[@class='entry']").extract_first()
    star_num = response.xpath(
        "//div[@class='post-adds']//h10/text()").extract_first()
    bookmark_data = response.xpath(
        "//span[contains(@class, 'bookmark-btn')]/text()").extract_first()
    # Counts are embedded in text like "3 收藏"; pull out the number.
    bookmark_re = re.match(r".*?(\d+).*", bookmark_data)
    if bookmark_re:
        bookmark_num = int(bookmark_re.group(1))
    else:
        bookmark_num = 0
    comment_data = response.xpath(
        "//a[@href='#article-comment']/span/text()").extract()[0]
    comment_re = re.match(r".*?(\d+).*", comment_data)
    if comment_re:
        comment_num = int(comment_re.group(1))
    else:
        comment_num = 0
    # Map the scraped values into the item.
    # NOTE(review): the method ends after creating the item without
    # populating or yielding it — looks unfinished; confirm upstream.
    article_item = JobboleArticleItem()
def parse_detail(self, response):
    """Parse a news detail page, then fetch its counters via an AJAX request.

    BUG FIX: the cover-image guard used `meta.get(..., " ")` — the space
    default is truthy, so the empty-list branch was unreachable and a
    missing cover produced `[""]`, which the image pipeline would try to
    download. The guard now defaults to "" so a missing cover yields [].
    """
    # The numeric post id is embedded in the URL; it keys the AJAX endpoint.
    match_re = re.match(".*?(\d+)", response.url)
    if match_re:
        post_id = match_re.group(1)
        article_item = JobboleArticleItem()
        title = response.css("#news_title a::text").extract_first("")
        create_date = response.css("#news_info .time::text").extract_first(
            "")
        # Keep only the part starting at the first digit (drops any label).
        match_re = re.match(".*?(\d+.*)", create_date)
        if match_re:
            create_date = match_re.group(1)
        content = response.css("#news_content").extract()[0]
        tag_list = response.css(".news_tags a::text").extract()
        tags = ",".join(tag_list)
        article_item["title"] = title
        article_item["create_date"] = create_date
        article_item["content"] = content
        article_item["tags"] = tags
        article_item["url"] = response.url
        # The downloader expects a LIST of image URLs.
        front_image_url = response.meta.get("front_image_url", "")
        if front_image_url:
            article_item["front_image_url"] = [front_image_url]
        else:
            article_item["front_image_url"] = []
        # Hand the half-filled item to parse_news_info through request meta.
        yield Request(url=parse.urljoin(
            response.url,
            "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
            meta={"article_item": article_item},
            callback=self.parse_news_info)
def parse_detail(self, response):
    """Scrape one article page and yield a fully populated JobboleArticleItem."""
    item = JobboleArticleItem()

    # Cover image URL forwarded from the listing request.
    cover_url = response.meta.get('remote_img_url', '')
    headline = response.css(".entry-header h1::text").extract()[0]
    # Trim the trailing " ·" decoration from the raw date text.
    date_text = response.css(
        '.entry-meta-hide-on-mobile::text').extract_first('')[0:-3].strip()
    upvotes = response.css('.vote-post-up h10::text').extract_first(0)

    bookmarks = response.css(
        '.bookmark-btn::text').extract_first('').replace('收藏', '').strip()
    if bookmarks == '':
        bookmarks = 0

    comments = response.css(
        "a[href='#article-comment'] span::text").extract_first('').replace(
            '评论', '').strip()
    if comments == '':
        comments = 0

    body_html = response.css('.entry').extract()[0]
    # The tag row also carries the comment count ("N 评论"); filter it out.
    raw_tags = response.css('.entry-meta-hide-on-mobile a::text').extract()
    kept_tags = [t for t in raw_tags if not t.strip().endswith('评论')]
    joined_tags = '-'.join(kept_tags)

    item['remote_img_url'] = [cover_url]
    item['title'] = headline
    item['publish_date'] = date_text
    item['praise_nums'] = upvotes
    item['fav_nums'] = bookmarks
    item['comment_nums'] = comments
    item['content'] = body_html
    item['tags'] = joined_tags
    yield item
def parse_detail(self, response):
    """Load one article page into JobboleArticleItem via MyItemLoader.

    Cleanup: removed the large commented-out manual-extraction block and the
    dead `pass` statement that trailed the `yield`.
    """
    # Cover image forwarded from the listing request.
    img_url = response.meta.get("front_img_url")

    item_loader = MyItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_css("title", "div.entry-header h1::text")
    item_loader.add_css("tag", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("vote_num", "div.post-adds span h10::text")
    item_loader.add_css("collect_num", "div.post-adds span.bookmark-btn::text")
    item_loader.add_css("comment_num", 'a[href="#article-comment"] span::text')
    item_loader.add_css("content", "div.entry")
    item_loader.add_value("front_image_url", [img_url])
    item_loader.add_value("url", response.url)
    item_loader.add_value(
        "url_id",
        hashlib.md5(response.url.encode(encoding='utf-8')).hexdigest())

    item = item_loader.load_item()
    yield item
def parse_article(self, response):
    """Scrape one article page and yield a JobboleArticleItem.

    BUG FIX: the original removed entries from `tags` while iterating over
    the same list, which skips elements after each removal; the filter is
    now a generator over the original list.
    """
    title = response.xpath(
        "//*[@class='entry-header']/h1/text()").extract()[0].strip()
    # Raw text looks like "2018/03/08 ·"; drop the trailing separator.
    date_text = response.css(
        ".entry-meta-hide-on-mobile::text")[0].extract().strip().replace(
            ' ·', '')
    try:
        create_date = datetime.datetime.strptime(date_text,
                                                 "%Y/%m/%d").date()
    except Exception:
        # Unparseable date: fall back to today instead of dropping the item.
        create_date = datetime.datetime.now().date()

    # Keep only entries starting with a non-digit (real tags, not counts).
    tag_list = response.css(".entry-meta-hide-on-mobile a::text").extract()
    tags = ','.join(tag for tag in tag_list if re.match(r'\D.*', tag.strip()))

    article_data = response.css(".post-adds span")
    try:
        # NOTE(review): extratc_num is a helper defined elsewhere on this
        # spider; its name appears misspelled at its definition too.
        praise_num = self.extratc_num(
            article_data[0].css("h10::text").extract()[0])
        favor_num = self.extratc_num(
            response.css(".post-adds span::text").extract()[2])
        comment_num = self.extratc_num(
            response.css(".post-adds span::text").extract()[3])
    except Exception as e:
        # Page layout changed or counters missing: record nothing.
        praise_num, favor_num, comment_num = None, None, None
        print(e)

    article_item = JobboleArticleItem()
    article_item['title'] = title
    article_item['create_date'] = create_date
    article_item['tags'] = tags
    article_item['url'] = response.url
    article_item['url_object_id'] = get_md5(response.url)
    article_item['praise_num'] = praise_num
    article_item['favor_num'] = favor_num
    article_item['comment_num'] = comment_num
    yield article_item
def parse_detail(self, response):
    """Populate JobboleArticleItem from an article page via an XPath ItemLoader.

    Cleanup: deleted the commented-out manual-extraction block retained from
    an earlier version of this spider.
    """
    item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                    response=response)
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_xpath(
        'create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')
    # Article cover image from request meta; pipelines expect a list.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_xpath('comment_nums',
                          '//a[@href="#article-comment"]/span/text()')
    # Articles with zero bookmarks have no <h10> node, so default to '0'.
    fav_nums = response.xpath(
        '//div[@class="post-adds"]/span[2]/h10/text()').extract_first()
    if fav_nums is None:
        fav_nums = '0'
    item_loader.add_value('fav_nums', fav_nums)
    item_loader.add_xpath('tags', '//div[@class="entry-meta"]/p/a/text()')
    item_loader.add_xpath('content', '//div[@class="entry"]')

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract title, date, vote count and URLs from one article page."""
    item = JobboleArticleItem()

    title_nodes = response.xpath('//*[@class="entry-header"]/h1/text()')
    item["title"] = title_nodes[0].extract()

    # Strip the decorative "·" separator around the raw date string.
    raw_time = response.xpath(
        '//*[@class="entry-meta-hide-on-mobile"]/text()').extract()[0]
    item["time"] = raw_time.replace('·', '').strip()

    item["vote"] = response.xpath(
        '//*[@class="post-adds"]/span[1]/h10/text()').extract()[0]
    item["url"] = response.url
    # Cover image URL passed along from the listing request.
    item["img_url"] = response.meta["front_img_url"]

    yield item
def parse_detail(self, response):
    """Extract all article fields with CSS selectors and yield the item.

    BUG FIXES:
    - `title` and `content` were stored as selector LISTS (bare .extract());
      they now take the first match as a string.
    - the comment selector lacked ::text, so raw <span> HTML was matched.
    - the greedy regex ".*(\\d+).*" captured only the LAST digit of
      multi-digit counts; a lazy prefix captures the whole number.
    """
    article_item = JobboleArticleItem()

    title = response.css(".entry-header h1::text").extract_first("")
    # Strip the decorative "·" separator around the date.
    create_date = response.css(
        "p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace(
            "·", "").strip()
    praise_nums = response.css("span.vote-post-up h10::text").extract()[0]
    front_img_url = response.meta.get("front_img_url", "")  # article cover image
    fav_nums = response.css(".bookmark-btn::text").extract()[0]
    match_re = re.match(r".*?(\d+).*", fav_nums)
    if match_re:
        fav_nums = int(match_re.group(1))
    else:
        fav_nums = 0
    comment_nums = response.css(
        "a[href='#article-comment'] span::text").extract()[0]
    match_re = re.match(r".*?(\d+).*", comment_nums)
    if match_re:
        comment_nums = int(match_re.group(1))
    else:
        comment_nums = 0
    content = response.css(".entry").extract_first("")
    # The tag row also contains the comment count ("N 评论"); drop it.
    tag_list = response.css('.entry-meta-hide-on-mobile a::text').extract()
    tag_list = [
        element for element in tag_list
        if not element.strip().endswith("评论")
    ]
    tags = ",".join(tag_list)

    article_item["title"] = title
    article_item["url"] = response.url
    article_item["url_object_id"] = get_md5(response.url)
    article_item["create_date"] = create_date
    article_item["front_img_url"] = [front_img_url]
    article_item["praise_nums"] = praise_nums
    article_item["fav_nums"] = fav_nums
    article_item["comment_nums"] = comment_nums
    article_item["tags"] = tags
    article_item["content"] = content
    yield article_item
def parse_detail(self, response):
    """Fill JobboleArticleItem through a TakeFirst item loader and yield it."""
    loader = TakeFirstItemLoader(item=JobboleArticleItem(), response=response)

    # Literal / computed values.
    loader.add_value("table_name", "jobbole_article")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url",
                     [response.meta.get("front_image_url", "")])

    # CSS-extracted fields, as (field, selector) pairs.
    css_fields = (
        ("title", ".entry-header h1::text"),
        ("create_date", ".entry-meta-hide-on-mobile::text"),
        ("praise_nums", "span.vote-post-up h10::text"),
        ("fav_nums", "span.bookmark-btn::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("tag", ".entry-meta-hide-on-mobile a::text"),
        ("content", ".entry"),
    )
    for field_name, selector in css_fields:
        loader.add_css(field_name, selector)

    yield loader.load_item()
def parse_article(self, response):
    """Parse an article page into a JobboleArticleItem and yield it."""
    # Cover image URL handed over from the listing request.
    cover = response.meta.get("img_url", "")

    loader = mArticleItemLoader(item=JobboleArticleItem(), response=response)

    loader.add_css("title", ".entry-header h1::text")
    loader.add_css("create_time", ".entry-meta-hide-on-mobile::text")
    loader.add_css("mark_nums", ".bookmark-btn::text")
    loader.add_css("comment_nums",
                   ".btn-bluet-bigger.href-style.hide-on-480::text")
    loader.add_css("vote_nums", ".vote-post-up h10::text")
    loader.add_css("content", ".entry")

    # Record when this page was crawled (day granularity).
    loader.add_value("crawl_time", datetime.now().strftime("%Y/%m/%d"))
    loader.add_value("url", response.url)
    loader.add_value("img_url", [cover])
    loader.add_value("object_id", get_md5(response.url))

    yield loader.load_item()
def parse_detail(self, response):
    """Parse a news page, then request its AJAX counters with the item in meta."""
    # The numeric post id is the first digit run in the URL.
    id_match = re.match(".*?(\d+)", response.url)
    if id_match:
        item = JobboleArticleItem()
        item["title"] = response.xpath(
            '//div[@id="news_title"]/a/text()').extract_first("")
        item["create_date"] = response.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()'
        ).extract_first("")
        item["content"] = response.xpath(
            '//div[@id="news_body"]').extract()[0]
        item["tags"] = ",".join(
            response.xpath(
                '//div[@id="news_more_info"]/div[@class="news_tags"]/a/text()'
            ).extract())
        item["url"] = response.url
        item["url_object_id"] = common.get_md5(response.url)
        # The image pipeline expects a list of cover URLs.
        item["front_img_url"] = [response.meta.get("front_img_url", "")]

        # Vote/comment counts live behind a separate AJAX endpoint.
        post_id = id_match.group(1)
        info_url = parse.urljoin(
            response.url,
            "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id))
        yield Request(url=info_url,
                      meta={
                          "article_item": item,
                          "url": response.url
                      },
                      callback=self.parse_num)
def parse_detail(self, response):
    """Extract article fields and yield the item built with an ItemLoader.

    Cleanup: removed two large triple-quoted "comment" blocks (they were
    real string-literal expressions, evaluated and discarded at runtime)
    and the dead initial `article_item = JobboleArticleItem()` assignment.
    """
    # Article cover image forwarded from the listing request.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                    response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Pipelines expect a list of cover URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums",
                        "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", "span.bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    # Hand the item to the pipelines.
    yield article_item
def parse_detail(self, response):
    """Load all article fields via ArticleItemLoader and yield the item.

    Cleanup: removed the two '''-quoted dead-code blocks (xpath and css
    manual extraction) kept from earlier iterations of this spider.
    """
    # Cover image; .get() returns "" instead of raising when the key is absent.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                    response=response)
    item_loader.add_css('title', ".entry-header h1::text")
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_value('url', response.url)
    item_loader.add_css('create_date', "p.entry-meta-hide-on-mobile::text")
    # Pipelines expect a list of cover URLs.
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_css('praise_nums', ".vote-post-up h10::text")
    item_loader.add_css('comment_nums',
                        "a[href='#article-comment'] span::text")
    item_loader.add_css('fav_nums', ".bookmark-btn::text")
    item_loader.add_css('tags', "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css('content', "div.entry")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Populate JobboleArticleItem through an XPath-based ItemLoader.

    Cleanup: removed the commented-out manual-extraction block from an
    earlier version of this spider.
    """
    # Article cover image forwarded from the listing request.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                    response=response)
    item_loader.add_xpath('title', "//div[@class='entry-header']/h1/text()")
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_xpath(
        'create_date', "//p[@class='entry-meta-hide-on-mobile']/text()")
    # Pipelines expect a list of cover URLs.
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_xpath(
        'praise_nums', "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath('comment_nums',
                          "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath(
        'fav_nums', "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath(
        'tags', "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    # NOTE(review): 'content' loading was disabled in the original —
    # confirm whether the article body is intentionally skipped.
    # item_loader.add_xpath('content', "//div[@class='entry']")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract the article's fields through an ItemLoader and yield the item.

    The cover-image URL is handed over in ``response.meta`` by the listing
    page; selector-based fields come from the article page itself.
    """
    front_img_url = response.meta.get('front_img_url', '')  # article cover image

    item_loader = ItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_value('url', response.url)
    # BUG FIX: url_object_id must be the MD5 digest of the URL. The plain
    # ItemLoader has no field processor that would hash it, so the raw URL
    # was being stored; the direct-assignment version of this method used
    # get_md5(response.url) for this field.
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_value('front_img_url', front_img_url)
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    item_loader.add_xpath('create_date', "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_xpath('thumb_up', "//span[contains(@class, 'vote-post-up')]/h10/text()")
    item_loader.add_xpath('save_num', "//span[contains(@class, 'bookmark-btn')]/text()")
    item_loader.add_xpath('comment_num', "//a[@href = '#article-comment']/span/text()")
    item_loader.add_xpath('content', "//div[@class = 'entry']")
    item_loader.add_css('tag', ".entry-meta-hide-on-mobile > a ::text")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Scrape one article page into a JobboleArticleItem via the custom loader.

    The cover-image URL travels in through ``response.meta`` from the list
    page request; the remaining fields are CSS-selected from the page.
    """
    # FIX: dropped a dead `article_item = JobboleArticleItem()` that was
    # immediately shadowed by the load_item() result below.
    front_image_url = response.meta.get("front_image_url", "")  # article cover image

    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)

    # Page-derived fields.
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('fav_nums', '.bookmark-btn::text ')
    item_loader.add_css('comment_nums', "a[href='#article-comment'] span::text")
    item_loader.add_css('content', 'div.entry')
    item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')

    # Request-derived values.
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_value('url', response.url)
    item_loader.add_value('front_image_url', [front_image_url])

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Fill a JobboleArticleItem through the ItemLoader and yield it.

    The cover-image URL is passed in via ``response.meta`` by the request
    issued from the listing page.
    """
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    front_image_url = response.meta.get("front_image_url", "")  # article cover image

    item_loader.add_css("title", "div.entry-header h1::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("praise_number", "span.vote-post-up h10::text")
    # BUG FIX: selector previously read "p.entry-meta-hider-on-mobile::text"
    # (typo), which matches nothing; the class used everywhere else in this
    # method (e.g. the 'tags' selector below) is "entry-meta-hide-on-mobile".
    item_loader.add_css("create_time", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    item_loader.add_css("fav_nums", "span.bookmark-btn::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Load one article page into a JobboleArticleItem and yield it.

    CSS-selected fields come from the article page; the cover-image URL is
    carried in ``response.meta`` by the listing-page request.
    """
    cover_image = response.meta.get("front_image_url", "")  # article cover image

    loader = ArticlespiderItemLoader(item=JobboleArticleItem(), response=response)

    # Field name -> CSS selector, applied in a single pass.
    css_fields = {
        "title": ".entry-header h1::text",
        "create_date": "p.entry-meta-hide-on-mobile::text",
        "praise_nums": "span.vote-post-up h10::text",
        "comment_nums": "a[href='#article-comment'] span::text",
        "fav_nums": "span.bookmark-btn::text",
        "tags": "p.entry-meta-hide-on-mobile a::text",
        "content": "div.entry",
    }
    for field, selector in css_fields.items():
        loader.add_css(field, selector)

    # Values that come from the request rather than the page body.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", common.md5(response.url))
    loader.add_value("front_image_url", [cover_image])

    yield loader.load_item()
def parse_detail(self, response):
    """Extract the article's fields via the item loader and yield the item.

    The cover-image URL is supplied in ``response.meta`` by the request made
    from the article-list page.
    """
    # FIX: removed a dead `article_item = JobboleArticleItem()` that was
    # immediately overwritten by the load_item() result at the end.
    front_image_url = response.meta.get('front_image_url', '')  # article cover image

    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)

    # Fields scraped from the page.
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('fav_nums', 'span.bookmark-btn::text')
    item_loader.add_css('comment_nums', "a[href='#article-comment'] span::text")
    item_loader.add_css('content', "div.entry")
    item_loader.add_css('tags', "p.entry-meta-hide-on-mobile a::text")

    # Values derived from the request.
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_value('url', response.url)
    item_loader.add_value('front_image_url', [front_image_url])

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Load the article page's fields through the item loader and yield them.

    The cover image URL is forwarded in ``response.meta`` by the request
    issued from the listing page.
    """
    front_image_url = response.meta.get(
        "front_image_url")  # article cover image, passed in via Request.meta

    # Load the item through the ItemLoader.
    item_loader = AticleItmeLoad(item=JobboleArticleItem(), response=response)

    # Direct values (no extraction needed).
    item_loader.add_value("url", response.url)
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_value("front_image_url_db", front_image_url)
    item_loader.add_value("url_object_id", get_md5(response.url))

    # Extracted fields.
    item_loader.add_xpath("re_title", "//div[@class='entry-header']/h1/text()")
    item_loader.add_xpath("content", "//div[@class='entry']")
    # BUG FIX: "praise_num" was added twice with the same xpath, so the
    # field's value list was collected twice; one call is enough.
    item_loader.add_xpath(
        "praise_num", "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath(
        "fav_num", "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath("comment_num",
                          "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath(
        "tag", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    item_loader.add_css("re_time", "p.entry-meta-hide-on-mobile::text")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract the article's fields with CSS selectors and yield the item."""

    def first_int(text):
        # First run of digits in `text` as an int; 0 when none is present.
        found = re.match(r".*?(\d+).*", text)
        return int(found.group(1)) if found else 0

    article_item = JobboleArticleItem()

    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    title = response.css(".entry-header h1::text").extract()[0]
    create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·", "").strip()
    praise_nums = response.css("span.vote-post-up h10::text").extract()[0]
    fav_nums = first_int(response.css("span.bookmark-btn::text").extract()[0])
    comment_nums = first_int(response.css("a[href='#article-comment'] span::text").extract()[0])
    content = response.css("div.entry").extract()[0]

    tag_list = response.css('p.entry-meta-hide-on-mobile a::text').extract()
    # The meta line mixes real tags with an "N 评论" entry — filter it out.
    tag_list = [element for element in tag_list if not element.strip().endswith(u"评论")]
    tags = ",".join(tag_list)

    article_item['url_object_id'] = get_md5(response.url)
    article_item["title"] = title
    article_item["url"] = response.url
    try:
        create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
    except Exception:
        # Unparseable date on the page — fall back to today.
        create_date = datetime.datetime.now().date()
    article_item["create_date"] = create_date
    article_item["front_image_url"] = [front_image_url]
    article_item["praise_nums"] = praise_nums
    article_item["comment_nums"] = comment_nums
    article_item["fav_nums"] = fav_nums
    article_item["tags"] = tags
    article_item["content"] = content
    yield article_item
def parse_detail(self, response):
    """Parse one article page and yield a fully populated item.

    Selector note: scrapy receives the page source before any JS runs, so
    positional xpaths copied from a browser (/div[1]/div[2]/...) can be
    unreliable; selectors anchored on an id or a meaningful class are safer.
    """

    def digits_or_zero(text):
        # First run of digits in `text` as an int, 0 when absent.
        found = re.match(".*?(\d+).*", text)
        return int(found.group(1)) if found else 0

    coverImg = response.meta.get("coverImg", "")
    title = response.css(".entry-header h1::text").extract_first()
    date = response.xpath(
        '//*[@class="entry-meta"]/p[1]/text()')[0].extract().replace(
            '·', '').strip()
    thumbUp = int(
        response.xpath(
            "//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0])
    favNum = digits_or_zero(
        response.xpath(
            "//span[contains(@class, 'bookmark-btn')]/text()").extract()[0])
    commentNum = digits_or_zero(
        response.xpath("//a[@href='#article-comment']/span/text()").extract()[0])

    tagList = response.xpath(
        "//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
    # The meta line also carries an "N 评论" pseudo-tag — drop it.
    tagList = [
        element for element in tagList
        if not (element.strip().endswith("评论"))
    ]
    tags = ",".join(tagList)

    # Flatten every text node of the body and collapse all whitespace runs
    # (tabs/newlines included) to single spaces.
    content = " ".join(response.css("div.entry *::text").extract())
    content = content.replace("\t", " ").replace("\n", " ").replace("\r", " ")
    content = " ".join(content.split())

    item = JobboleArticleItem()
    item["title"] = title
    try:
        create_date = datetime.datetime.strptime(date, "%Y/%m/%d").date()
    except Exception:
        # Unparseable publication date — default to today.
        create_date = datetime.datetime.now().date()
    item["date"] = create_date
    item["url"] = response.url
    # Stored as a list because the image pipeline iterates over the URLs.
    item["coverImgUrl"] = [coverImg]
    item["thumbUp"] = thumbUp
    item["favNum"] = favNum
    item["commentNum"] = commentNum
    item["tags"] = tags
    item["content"] = content
    item["urlObjId"] = get_md5(response.url)

    # Delivered to the pipelines when they are enabled in settings.
    yield item