def parse_detail(self, response):
    """Extract one Jobbole article page into a JobboleItem via MyItemLoader.

    Field cleanup (stripping, joining, type conversion) is delegated to the
    processors declared on MyItemLoader / JobboleItem.  The previous manual
    xpath-extraction version was dead commented-out code and has been removed.
    """
    item_loader = MyItemLoader(item=JobboleItem(), response=response)
    item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    item_loader.add_xpath("create_time",
                          '//p[@class="entry-meta-hide-on-mobile"]/text()')
    item_loader.add_value("url", response.url)
    # md5 of the url gives a fixed-length unique id for the article.
    item_loader.add_value("url_obj_id", get_md5(response.url))
    item_loader.add_xpath("tags",
                          '//p[@class="entry-meta-hide-on-mobile"]/a/text()')
    item_loader.add_xpath("content", '//div[@class="entry"]')
    yield item_loader.load_item()
def parse_detail(self, response):
    """Extract all fields of a Jobbole article page into a JobboleItem."""
    article_item = JobboleItem()
    # Article cover image url, forwarded from the list page via request meta.
    front_image_url = response.meta.get("front_image_url", "")
    title = response.xpath(
        '//div[@class="entry-header"]/h1/text()').extract_first()
    create_date = response.xpath(
        '//p[@class="entry-meta-hide-on-mobile"]/text()').extract(
        )[0].strip().split()[0]
    tag_list = response.xpath(
        '//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
    # Drop the "N 评论" pseudo-tag that shares the meta line with real tags.
    tag_list = [
        element for element in tag_list
        if not element.strip().endswith("评论")
    ]
    tag = ",".join(tag_list)
    praise_nums = response.xpath(
        '//span[contains(@class,"vote-post-up")]/h10/text()').extract()
    praise_nums = int(praise_nums[0]) if praise_nums else 0
    fav_nums = response.xpath(
        '//span[contains(@class,"bookmark-btn")]/text()').extract()[0]
    # Non-greedy ".*?" so the group captures the whole number: the original
    # greedy r".*(\d+).*" matched only the LAST digit ("12 收藏" -> 2).
    match_re = re.match(r".*?(\d+).*", fav_nums)
    fav_nums = int(match_re.group(1)) if match_re else 0
    comment_nums = response.xpath(
        "//a[@href='#article-comment']/span/text()").extract()[0]
    match_com = re.match(r".*?(\d+).*", comment_nums)
    comment_nums = int(match_com.group(1)) if match_com else 0
    content = response.xpath('//div[@class="entry"]').extract()[0]
    # md5 of the url gives a fixed-length unique id for the article.
    article_item["url_object_id"] = get_md5(response.url)
    article_item["title"] = title
    article_item["url"] = response.url
    try:
        create_date = datetime.datetime.strptime(create_date,
                                                 '%Y/%m/%d').date()
    except Exception:
        # Fall back to today's date when the page date is malformed.
        create_date = datetime.datetime.now().date()
    article_item["create_date"] = create_date
    # Scrapy's image pipeline expects a LIST of download urls.
    article_item["front_image_url"] = [front_image_url]
    article_item["praise_nums"] = praise_nums
    article_item["fav_nums"] = fav_nums
    article_item["comment_nums"] = comment_nums
    article_item["tag"] = tag
    article_item['content'] = content
    yield article_item
def parse_datail(self, response):
    """Pull title, publish date and url from an article page.

    NOTE(review): the selectors are absolute (/html/body/...) paths and the
    method name has a typo ("datail") — kept because it is the callback name
    registered elsewhere.
    """
    item = JobboleItem()
    item["title"] = response.xpath(
        "/html/body/div[1]/div[3]/div[1]/div[1]/h1/text()").extract()[0]
    raw_date = response.xpath(
        "/html/body/div[1]/div[3]/div[1]/div[2]/p/text()").extract()[0]
    item["date"] = raw_date.strip().replace(" ·", "")
    item["url"] = str(response.url)
    yield item
def parse_detail(self, response):
    """Populate a JobboleItem for one article page through an item loader.

    Field cleaning (stripping, int conversion, tag joining) is delegated to
    the processors declared on JobBoleArticleItemLoader / JobboleItem.
    The unused local ``jobbole_item = JobboleItem()`` has been removed.
    """
    # Cover image url forwarded from the list page via request meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader = JobBoleArticleItemLoader(item=JobboleItem(),
                                           response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # The image pipeline expects a LIST of urls.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums",
                        "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    yield item_loader.load_item()
def parse(self, response):
    """Walk the article listing and hand each entry off for detail scraping.

    Entries with a detail url are fetched via ``get_content`` (which
    receives the partially-filled item through request meta); entries
    without one are yielded directly.  The original version also yielded
    every item unconditionally, emitting a duplicate half-filled item for
    each entry whose detail request had already been scheduled.
    """
    posts = Selector(response).xpath('//div[@class="post floated-thumb"]')
    for post in posts:
        item = JobboleItem()
        item['title'] = post.xpath(
            'div[2]/p[1]/a[1]/text()').extract_first()
        item['url'] = post.xpath(
            'div[2]/p[1]/a[1]/@href').extract_first()
        if item['url']:
            # get_content completes the item and yields it.
            yield Request(item['url'],
                          callback=self.get_content,
                          meta={'item': item})
        else:
            yield item
def parse(self, response):
    """Yield one item per post on the listing page, then follow pagination."""
    jobs = response.xpath(
        '// *[ @ id = "archive"]/div[@class="post floated-thumb"]')
    for j in jobs:
        item = JobboleItem()
        # NOTE(review): extract() stores the title as a LIST of strings;
        # if a plain string is wanted, switch to extract_first() — verify
        # against the pipeline before changing.
        item['title'] = j.xpath('./div[2]/p[1]/a[1]/text()').extract()
        yield item
    # extract_first() returns None when there is no next-page link; the
    # original extract()[0] raised IndexError on the last page, which made
    # the "is not None" guard below dead code.
    next_page = response.xpath(
        '//*[@id="archive"]/div[21]/a[4]/@href').extract_first()
    if next_page:
        next_page = response.urljoin(next_page)
        # Re-submit to parse to crawl the next listing page.
        yield scrapy.http.Request(next_page,
                                  callback=self.parse,
                                  dont_filter=True)
def jobbole_parse(self, response):
    """Extract article fields and yield a JobboleItem built via kwargs."""
    date = response.xpath(
        '//p[@class="entry-meta-hide-on-mobile"]/text()').getall()[0].strip()
    # Strip the decorative "·" separator from the date line.
    publish_time = re.sub('·', '', date)
    category = ''.join(
        response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').
        getall()[0])
    origin_link = response.xpath(
        '//div[@class="copyright-area"]/a/@href').get()
    origin_author = response.xpath(
        '//div[@class="copyright-area"]/a/text()').get()
    # Fixed: the original '////div[...]' is not valid XPath ("//" cannot be
    # followed directly by another "/") and raises at evaluation time.
    content = response.xpath('//div[@class="entry"]').getall()
    title = response.xpath('//div[@class="entry-header"]/h1/text()').get()
    item = JobboleItem(
        publish_time=publish_time,
        category=category,
        origin_link=origin_link,
        title=title,
        origin_author=origin_author,
        content=content,
    )
    yield item
def parseJobDetail(self, response):
    """Fill a JobboleItem from an article detail page with xpath selectors."""
    print(response.status)
    item = JobboleItem()
    header = response.xpath(
        '//div[@class="grid-8"]/div/div[@class="entry-header"]/h1/text()')
    item['title'] = header[0].extract()
    item['creation_time'] = response.xpath(
        './/p[@class="entry-meta-hide-on-mobile"]/text()')[0].extract()
    item['article_addresses'] = response.url
    item['image_links'] = response.xpath(
        '//*[@class="entry"]//img/@src').extract()
    item['praise_num'] = response.xpath(
        '//div[@class="post-adds"]/span[1]/h10/text()').extract()
    item['collect_num'] = response.xpath(
        '//div[@class="post-adds"]/span[2]/text()').extract()
    item['comment_num'] = response.xpath(
        '//span[@class="btn-bluet-bigger href-style hide-on-480"]/text()'
    ).extract()
    # NOTE: key 'centent' (sic) matches the field declared on JobboleItem.
    item['centent'] = response.xpath(
        "//div[@class='entry']//p/text()").extract()
    item['label'] = response.xpath(
        '//div[@class="entry-meta"]/p/a[3]/text()').extract()
    yield item
def parse_content(self, response):
    """Parse one article page into a JobboleItem.

    The original bookmark/comment extraction indexed ``[0]`` into the
    extracted STRING, keeping only its first character (so "12 收藏"
    became "1", and a leading space became " "); the counts are now pulled
    out with a digit regex, still stored as strings.
    """
    print('开始匹配')
    item = JobboleItem()
    # Title
    item['title'] = response.xpath(
        '//div[@class="entry-header"]/h1/text()').extract()[0]
    # Creation date
    item['create_data'] = response.xpath(
        '//div[@class="entry-meta"]/p/text()').extract()[0]
    # Article url
    item['url'] = response.url
    # Cover image url ('' when the article has none)
    item['img_url'] = response.xpath(
        '//img[@class="aligncenter"][1]/@src').extract_first('')
    # Praise count
    item['praise_nums'] = response.xpath(
        '//div[@class="post-adds"]/span[1]/h10/text()').extract()[0]
    # Bookmark count
    bookmark_text = response.xpath('//span[@class=" btn-bluet-bigger href-style bookmark-btn register-user-only "]/text()').extract_first('0')
    match = re.search(r'\d+', bookmark_text)
    item['bookmark_nums'] = match.group() if match else '0'
    # Comment count
    comment_text = response.xpath(
        '//div[@class="post-adds"]/a/span/h10/text()').extract_first('0')
    match = re.search(r'\d+', comment_text)
    item['comment_nums'] = match.group() if match else '0'
    # Article body paragraphs
    item['content'] = response.xpath(
        '//div[@class="entry"]//p/text()').extract()
    # Tags
    item['tags'] = response.xpath(
        '//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
    yield item
def parse_detail(self, response):
    """Extract article fields (css selectors) and yield a JobboleItem.

    The earlier xpath-based version was dead commented-out code and has
    been removed.
    """
    article_item = JobboleItem()
    # Cover image url forwarded from the list page via request meta.
    front_image_url = response.meta.get('front_image_url', '')
    title = response.css('.entry-header h1::text').extract_first()
    create_date = response.css(
        '.entry-meta-hide-on-mobile::text').extract_first().strip().replace(
            '·', '').strip()
    praise_numbers = response.css('.vote-post-up h10::text').extract_first()
    favorite_numbers = response.css('.bookmark-btn::text').extract_first()
    # Non-greedy ".*?" so the group captures the whole number: the original
    # greedy r'.*(\d+).*' matched only the LAST digit ("12 收藏" -> 2).
    match_re = re.match(r'.*?(\d+).*', favorite_numbers)
    if match_re:
        favorite_numbers = int(match_re.group(1))
    else:
        favorite_numbers = 0
    comment_numbers = response.css(
        'a[href="#article-comment"] span::text').extract_first()
    match_re = re.match(r'.*?(\d+).*', comment_numbers)
    if match_re:
        comment_numbers = int(match_re.group(1))
    else:
        comment_numbers = 0
    content = response.css('div.entry').extract()
    tag_list = response.css('.entry-meta-hide-on-mobile a::text').extract()
    # Drop the "N 评论" pseudo-tag that shares the meta line with real tags.
    tag_list = [
        element for element in tag_list
        if not element.strip().endswith("评论")
    ]
    tags = ','.join(tag_list)
    # md5 of the url gives a fixed-length unique id.
    article_item['url_object_id'] = get_md5(response.url)
    article_item['title'] = title
    article_item['create_date'] = create_date
    article_item['url'] = response.url
    # Scrapy's image pipeline expects a LIST of download urls.
    article_item['front_image_url'] = [front_image_url]
    article_item['praise_numbers'] = praise_numbers
    article_item['comment_numbers'] = comment_numbers
    article_item['favorite_numbers'] = favorite_numbers
    article_item['tags'] = tags
    article_item['content'] = content
    yield article_item
def parse_detail(self, response):
    """Load a JobboleItem for one article page via ArticleItemLoader.

    Field cleaning (date parsing, int conversion, tag filtering) is
    delegated to the processors declared on ArticleItemLoader / JobboleItem.
    The previous manual-extraction version was dead code wrapped in a
    triple-quoted string (and contained a '%Y/%m/%s' strptime-format typo);
    it has been removed.
    """
    # Cover image url forwarded from the list page via request meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader = ArticleItemLoader(item=JobboleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value('url', response.url)
    # md5 of the url gives a fixed-length unique id.
    item_loader.add_value('url_object_id', get_md5(response.url))
    # The image pipeline expects a LIST of urls.
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_css('create_time', '.entry-meta-hide-on-mobile::text')
    item_loader.add_css('vote_nums', '.vote-post-up h10::text')
    item_loader.add_css('mark_nums', '.bookmark-btn::text')
    item_loader.add_css('comment_nums',
                        '.btn-bluet-bigger.href-style.hide-on-480::text')
    item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', '.entry')
    yield item_loader.load_item()
def parse_detail(self, response):
    """Load a JobboleItem for one article page via ArticleItemLoader.

    ArticleItemLoader is a custom loader that keeps only the first value of
    each extracted list; remaining cleanup lives in the field processors on
    JobboleItem.  The two dead commented-out extraction versions (xpath and
    css) have been removed.
    """
    # Cover image url forwarded from the list page via request meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader = ArticleItemLoader(item=JobboleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    # md5 of the url gives a fixed-length unique id.
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # The image pipeline expects a LIST of urls.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums",
                        "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    yield item_loader.load_item()