def parse_detail(self, response):
    """Extract article fields from a cnblogs post page via ItemLoader and yield the item.

    Every field produced by a plain ItemLoader is a list by default; the
    item/loader output processors are expected to normalise the values.
    """
    # Removed: a dead `articleItem = ArticlespiderItem()` assignment — it was
    # unconditionally replaced by `item_loader.load_item()` below.
    item_loader = ItemLoader(item=ArticlespiderItem(), response=response)
    item_loader.add_xpath('title', '//a[@id="cb_post_title_url"]/text()')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    # Scrapy image pipelines expect a *list* of URLs, hence the wrapping;
    # meta.get with a default avoids a KeyError when no cover was passed.
    item_loader.add_value('front_image_url', [response.meta.get('front_image_url', '')])
    article_item = item_loader.load_item()
    yield article_item
def parse_details(self, response):
    """Manually populate an ArticlespiderItem from an article detail page.

    BUG FIX: the trailing ItemLoader stub was removed — it passed the item
    *class* instead of an instance and then called add_css()/add_xpath()/
    add_value() with no arguments, which raises TypeError before `yield`.
    """
    news = ArticlespiderItem()
    img_url = response.meta.get("img_url", "")
    title = response.css(".entry-header h1::text").extract_first("")
    # Renamed from `datetime` so the local no longer shadows the module.
    create_datetime = (response.css(".entry-meta-hide-on-mobile::text")
                       .extract_first("").replace(" ·", "").strip())
    praise_num_str = response.css("#114676votetotal::text").extract_first("")
    praise_num = int(praise_num_str) if praise_num_str else 0
    # extract_first("") instead of extract()[0]: no IndexError on pages
    # without a paragraph in .entry.
    content = response.css(".entry p::text").extract_first("")
    news['title'] = title
    news['datetime'] = create_datetime
    news['praise_num'] = praise_num
    news['content'] = content
    news['img_url'] = [img_url]
    # NOTE(review): hashing img_url rather than response.url looks
    # unintended — confirm against the item's url_object_id semantics.
    news['url_object_id'] = get_md5(img_url)
    yield news
def detail(self, response):
    """Extract article fields with an ItemLoader and return the loaded item.

    Selector post-processing lives in the loader / item field definitions,
    keeping this callback short and declarative. The previous manual
    extraction code was dead (fully commented out) and has been removed.
    """
    item_loader = ArticleItemLoder(item=ArticlespiderItem(), response=response)
    item_loader.add_css('title', 'div.entry-header h1::text')
    item_loader.add_css('pubtime', '.entry-meta-hide-on-mobile::text')
    item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('praise_num', '.post-adds span h10::text')
    item_loader.add_css('fav_num', '.bookmark-btn::text')
    item_loader.add_css('comment_num', 'a[href="#article-comment"] span::text')
    item_loader.add_css('content', 'div.entry p::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    # Image pipelines expect a list of URLs.
    item_loader.add_value('image_urls', [response.meta.get("image_urls", '')])
    article_item = item_loader.load_item()
    return article_item
def parse_detail(self, response):
    """Scrape a cnblogs news detail page, then fetch its stats via AJAX.

    The digg/view/comment counters live behind a separate JSON endpoint,
    so the partially filled item travels in ``meta`` to ``parse_nums``,
    which completes it. (The blocking ``requests`` library is deliberately
    avoided in favour of a chained scrapy Request.)
    """
    id_match = re.match(r".*?(\d+)", response.url)
    if not id_match:
        return

    article_item = ArticlespiderItem()
    article_item['title'] = response.css("#news_title a::text").extract_first("")

    create_date = response.css("#news_info .time::text").extract_first("")
    date_match = re.match(r'.*?(\d+.*)', create_date)
    if date_match:
        create_date = date_match.group(1)
    article_item['create_date'] = create_date

    article_item['content'] = response.css("#news_content").extract()
    article_item['tags'] = ','.join(response.css(".news_tags a::text").extract())

    front_image_url = response.meta.get('front_image_url', '')
    article_item['front_image_url'] = [front_image_url] if front_image_url else []
    article_item['url'] = response.url

    post_id = id_match.group(1)
    ajax_url = parse.urljoin(
        response.url,
        "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id))
    yield Request(url=ajax_url,
                  meta={'article_item': article_item},
                  callback=self.parse_nums)
def parse_detail(self, response):
    """Extract article fields through ArticleItemLoader and yield the item.

    Removed: a dead `article_item = ArticlespiderItem()` assignment that was
    unconditionally replaced by `item_loader.load_item()`.
    """
    front_image_url = response.meta.get("front_image_url", "")
    item_loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    # NOTE(review): `get_md5.get_md5` treats get_md5 as a module exposing a
    # same-named function — confirm against the project's import style.
    item_loader.add_value("url_object_id", get_md5.get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Image pipelines expect a list of URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    # NOTE(review): "parise_nums" looks misspelled ("praise") but must match
    # the field name declared on ArticlespiderItem — do not rename blindly.
    item_loader.add_css("parise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Scrape a cnblogs news page, then chain an AJAX request for its stats.

    The digg/view/comment counts are served from a separate JSON endpoint,
    so the partially filled item travels in ``meta`` to ``parse_nums``.
    """
    match_re = re.match(r".*?(\d+)", response.url)
    if match_re:
        post_id = match_re.group(1)
        # ItemLoader keeps the extraction declarative and easy to maintain;
        # raw values are normalised by the item/loader processors.
        item_loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
        item_loader.add_css("title", "#news_title a::text")
        item_loader.add_css("content", "#news_content")
        item_loader.add_css("tags", ".news_tags a::text")
        item_loader.add_css("create_time", "#news_info .time::text")
        item_loader.add_value("url", response.url)
        article_item = item_loader.load_item()
        # BUG FIX: the meta key was misspelled "front_image_ur;", so the
        # post-load branch always overwrote the cover URL with []. The cover
        # is now set exactly once, from the correctly spelled key.
        front_image_url = response.meta.get("front_image_url", "")
        article_item['front_image_url'] = [front_image_url] if front_image_url else []
        yield Request(url=parse.urljoin(
                          response.url,
                          "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                      meta={"article_item": article_item},
                      callback=self.parse_nums)
def parse_detail(self, response):
    """Manually extract all article fields from a detail page and yield the item."""
    front_img_url = response.meta.get('img_url', '')
    title = response.xpath(
        '//div[@class="entry-header"]/h1/text()').extract_first()
    create_date = response.xpath(
        '//p[@class="entry-meta-hide-on-mobile"]/text()').extract(
        )[0].strip().replace('·', '').strip()
    # Robustness: pages without a vote widget made int(None) raise TypeError.
    praise_num = int(response.xpath('//h10/text()').extract_first() or 0)
    # Bookmark count text, e.g. "39 收藏" -> leading integer.
    fav_str = response.xpath(
        '//div[@class="post-adds"]//span[contains(@class,"bookmark-btn")]/text()'
    ).extract_first().strip()
    fav_match_res = re.match(r'.*?(\d+).*', fav_str)
    fav_num = fav_match_res.group(1) if fav_match_res else 0
    # Comment count text, e.g. "890 评论".
    comment_str = response.xpath(
        '//a[@href="#article-comment"]/span[contains(@class,"href-style")]/text()'
    ).extract_first().strip()
    comment_match_res = re.match(r'.*?(\d+).*', comment_str)
    comment_num = comment_match_res.group(1) if comment_match_res else 0
    # Tag list, e.g. ['IT技术', '开发'] -> "IT技术,开发".
    tags_list = response.xpath(
        '//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
    tags = ','.join(tags_list)
    content = '\n'.join(
        response.xpath('//div[@class="entry"]/p/text()').extract())
    try:
        create_date = datetime.datetime.strptime(create_date,
                                                 '%Y/%m/%d').date()
    except Exception:
        # BUG FIX: `datetime.datetime.date()` raised TypeError (it is an
        # instance method); fall back to today's date instead.
        create_date = datetime.datetime.now().date()
    article_item = ArticlespiderItem()
    article_item['title'] = title
    article_item['create_date'] = create_date
    article_item['url'] = response.url
    article_item['url_object_id'] = get_md5(response.url)
    article_item['front_img_url'] = [front_img_url]
    article_item['front_img_path'] = [IMAGES_STORE]
    article_item['praise_num'] = praise_num
    article_item['fav_num'] = fav_num
    article_item['comment_num'] = comment_num
    article_item['tags'] = tags
    article_item['content'] = content
    yield article_item
def parse_detail(self, response):
    """Extract article fields (counts parsed eagerly) and load them via ItemLoader."""
    # Creation time, e.g. "2017/01/01 ·" -> "2017/01/01"; 'no' when absent.
    crttime_content = response.xpath('//div[@class="entry-meta"]/p/text()').extract()
    create_time = crttime_content[0].replace('·', '').strip() if crttime_content else 'no'
    # Article category; 0 when absent.
    article_kind_content = response.xpath('//div[@class="entry-meta"]/p/a/text()').extract()
    article_kind = article_kind_content[0] if article_kind_content else 0
    # Vote count.
    praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0]
    # Bookmark count text, e.g. "12 收藏".
    fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0]
    # BUG FIX: the greedy ".*(\d+).*" captured only the LAST digit of a
    # multi-digit count; non-greedy ".*?(\d+).*" captures the whole number
    # (matches the pattern used elsewhere in this file).
    match_re = re.match(r".*?(\d+).*", fav_nums)
    fav_nums = match_re.group(1) if match_re else 0
    # Comment count text, e.g. "4 评论".
    commant_nums = response.xpath("//a[@href='#article-comment']/span/text()").extract()[0]
    match_re = re.match(r".*?(\d+).*", commant_nums)
    commant_nums = match_re.group(1) if match_re else 0
    # Author name; 'no' when absent.
    author_name_content = response.xpath("//div[@id='author-bio']//a/text()").extract()
    author_name = author_name_content[0] if author_name_content else 'no'
    # Removed: an unused `title` local (the loader extracts the title itself).
    item_loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
    item_loader.add_value('url', response.url)
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value('create_time', [create_time])
    item_loader.add_value('article_kind', [article_kind])
    item_loader.add_value('praise_nums', [praise_nums])
    item_loader.add_value('fav_nums', [fav_nums])
    item_loader.add_value('commant_nums', [commant_nums])
    item_loader.add_value('author_name', [author_name])
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract article fields with CSS selectors and yield the filled item."""
    article_item = ArticlespiderItem()
    front_image_url = response.meta.get("front_image_url", "")  # article cover image

    title = response.css(".entry-header h1::text").extract()[0]
    # Currently unused (the create_date assignment below is disabled), but
    # kept so the extraction behaviour is unchanged.
    create_date = (response.css("p.entry-meta-hide-on-mobile::text")
                   .extract()[0].strip().replace("·", "").strip())
    praise_nums = response.css(".vote-post-up h10::text").extract()[0]

    # "12 收藏"-style strings -> leading integer, defaulting to 0.
    raw_fav = response.css(".bookmark-btn::text").extract()[0]
    fav_match = re.match(r".*?(\d+).*", raw_fav)
    fav_nums = int(fav_match.group(1)) if fav_match else 0

    raw_comment = response.css("a[href='#article-comment'] span::text").extract()[0]
    comment_match = re.match(r".*?(\d+).*", raw_comment)
    comment_nums = int(comment_match.group(1)) if comment_match else 0

    content = response.css("div.entry").extract()[0]

    # Drop the trailing "N 评论" pseudo-tag, join the rest with commas.
    tags = ",".join(
        tag for tag in response.css("p.entry-meta-hide-on-mobile a::text").extract()
        if not tag.strip().endswith("评论")
    )

    #article_item["url_object_id"] = get_md5(response.url)
    article_item["title"] = title
    article_item["url"] = response.url
    article_item["front_image_url"] = [front_image_url]
    article_item["praise_nums"] = praise_nums
    article_item["comment_nums"] = comment_nums
    article_item["fav_nums"] = fav_nums
    article_item["tags"] = tags
    article_item["content"] = content
    yield article_item
def _parse_data(self, response):
    """Build the article item with ArticleItemLoader and yield it to pipelines.

    Raw selector output is converted to the final field types by the
    loader's input/output processors, not here. The previous manual
    extraction code was dead (fully commented out, and it even contained a
    `stars`/`stats` typo) and has been removed.
    """
    loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", security.get_md5(response.url))
    # Image pipelines expect a list of URLs.
    loader.add_value("main_image_url", [response.meta.get("main_image_url", "")])
    loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    loader.add_xpath("create_date", '//*[@class="entry-meta-hide-on-mobile"]/text()')
    loader.add_xpath("stars", '//span[contains(@class,"bookmark-btn")]/text()')
    loader.add_xpath("thumb_ups", "//span[contains(@class,'vote-post-up')]/h10/text()")
    loader.add_xpath("comments", "//a[@href='#article-comment']/span/text()")
    loader.add_xpath("content", "//div[@class='entry']")
    loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    article = loader.load_item()
    yield article
# -*- coding: utf-8 -*-