def parse(self, response):
    """Yield a single PDFItem pointing at a fixed SZSE report PDF.

    The response body is parsed only so a malformed JSON payload still
    fails loudly; its contents are not otherwise used (the previous
    ``res_json`` local was dead code).
    """
    json.loads(response.text)[0]["data"]  # fail fast if the API payload changed
    item_loader = ArticleItemLoader(item=PDFItem(), response=response)
    url_list = [
        "http://reportdocs.static.szse.cn/UpFiles/fxklwxhj/CDD00079356200.pdf"
    ]
    item_loader.add_value("file_urls", url_list)
    yield item_loader.load_item()
def parse(self, response):
    """Parse the SZSE announcement JSON and yield one SZSEItem per record.

    The attachment download URL is rebuilt from the record's ``ck`` field
    via ``self.ck_rule``. Records whose ``ck`` value does not match the
    pattern are still yielded (with their metadata) but without
    ``file_urls``, instead of crashing with AttributeError on a None match.
    """
    res_json = json.loads(response.text)[2]["data"]
    for data in res_json:
        item_loader = ArticleItemLoader(item=SZSEItem(), response=response)
        item_loader.add_value("gsdm", data["gsdm"])
        item_loader.add_value("gsjc", data["gsjc"])
        item_loader.add_value("fhrq", data["fhrq"])
        item_loader.add_value("hjlb", data["hjlb"])
        match = re.match(self.ck_rule, data["ck"])
        if match:  # guard: unmatched links no longer raise AttributeError
            item_loader.add_value(
                "file_urls",
                ["http://reportdocs.static.szse.cn" + match.group(1)],
            )
        yield item_loader.load_item()
def parse_detail(self, response):
    """Extract a JianShu article page into a JianShuArticlespiderItem.

    The cover image URL is forwarded from the listing page via request
    meta; ``.get`` with a default avoids a KeyError when it is absent.
    """
    front_end_url = response.meta.get("front_end_url", "")
    item = ArticleItemLoader(item=JianShuArticlespiderItem(), response=response)
    item.add_xpath(
        "title",
        "//div[@class='post']/div[@class='article']/h1[@class='title']/text()",
    )
    item.add_value("url", response.url)
    item.add_value("front_image_url", [front_end_url])
    yield item.load_item()
def parse_detail(self, response):
    """Yield one GZZBItem per row of the ``.ge2_content`` table.

    Rows carrying fewer than two text cells cannot supply both ``code``
    and ``name`` and are yielded with only the fixed report date; rows
    with exactly one cell no longer crash with IndexError on ``person[1]``.
    """
    for person_node in response.css(".ge2_content tr"):
        person = person_node.css("td::text").extract()
        item_loader = ArticleItemLoader(item=GZZBItem(), response=response)
        if len(person) >= 2:  # guard against short rows (was: `if person`)
            item_loader.add_value("code", person[0])
            item_loader.add_value("name", person[1])
        item_loader.add_value("date", "2017-05")
        yield item_loader.load_item()
def parse_detail(self, response):
    """Scrape a Fang listing detail page into a FangItem.

    The address is reassembled from the ``.link-under`` link texts in the
    order index 1, 2, 0; pages exposing fewer than three links yield an
    empty address instead of raising IndexError. The previously created
    ``ArticleItemLoader`` was never used and has been removed.
    """
    fang_item = FangItem()
    fang_item["name"] = response.css(".zf_mfname::text").extract_first()
    fang_item["phone"] = response.css(".zf_mftel::text").extract_first()
    address_parts = response.css(".link-under::text").extract()
    if len(address_parts) >= 3:
        fang_item["address"] = (
            address_parts[1] + " - " + address_parts[2] + " - " + address_parts[0]
        )
    else:
        fang_item["address"] = ""
    yield fang_item
def parse_detail(self, response):
    """Parse a JobBole article detail page via the project item loader.

    All field extraction/cleaning is delegated to ArticleItemLoader's
    processors. The old hand-rolled ``create_date`` / ``praise_nums`` /
    ``image_url`` extractions were dead code (results discarded) and could
    themselves raise IndexError on pages missing those elements, so they
    have been removed along with the unused ``article_item`` instance.
    """
    front_img_url = response.meta.get("front_img_url", "")  # cover image
    item_loader = ArticleItemLoader(item=JobBoleAricleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_img_url", [front_img_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    yield item_loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article into a JobBoleArticleItem via item loader.

    Field cleaning lives in the loader's per-field processors. The dead
    ``article_item = JobBoleArticleItem()`` instantiation (immediately
    replaced by ``load_item()``) and the large commented-out manual
    extraction code have been removed; the cover image URL is read with
    ``meta.get`` so a missing key no longer raises KeyError.
    """
    front_image_url = response.meta.get("front_image_url", "")  # cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", "span.vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", "span.bookmark-btn::text")
    item_loader.add_css("tags", ".entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    yield item_loader.load_item()
def parse_detail(self, response):
    """Populate a JobBoleArticleItem from an article detail page.

    CSS-extracted fields are declared as a (field, selector) table and fed
    to the loader in one pass; value-based fields follow. Extraction and
    cleaning rules live in ArticleItemLoader's processors.
    """
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    css_fields = (
        ("title", ".entry-header h1::text"),
        ("create_date", "p.entry-meta-hide-on-mobile::text"),
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    )
    for field_name, selector in css_fields:
        loader.add_css(field_name, selector)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [response.meta.get("front_image_url", "")])
    yield loader.load_item()
def parse_detail(self, response):
    """Parse an article page into an ArticleItem via the custom loader.

    ArticleItemLoader (subclass of ItemLoader defined in items.py)
    applies the per-field input/output processors, so raw selector
    results are passed straight in. The duplicate ``front_image_url``
    fetch and the large commented-out manual-extraction code were
    removed; comments are translated to English.
    """
    front_image_url = response.meta.get('format_image', '')  # cover image
    item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_css('post_time', 'p.entry-meta-hide-on-mobile::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    # Image pipelines expect a list of URLs.
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('comment_nums', 'a[href="#article-comment"] span::text')
    item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', 'div.entry')
    yield item_loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article page into a JobBoleArticleItem.

    The loader handles extraction and cleaning; the dead
    ``article_item = JobBoleArticleItem()`` instantiation (immediately
    overwritten by ``load_item()``) and the commented-out manual
    extraction code have been removed.
    """
    front_image_url = response.meta.get("front_image_url", "")  # cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    yield item_loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article page into a JobboleArticleItem.

    Uses ArticleItemLoader for all extraction/cleaning — the preferred
    approach over manual xpath/css parsing kept here previously as
    reference. The dead ``article_item = JobboleArticleItem()``
    instantiation and the commented reference code were removed.
    """
    front_image_url = response.meta.get("front_image_url", "")  # cover image
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", "span.bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    # Hand off to the item pipeline.
    yield item_loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article detail page via ArticleItemLoader.

    The custom loader (not plain ItemLoader) carries the field
    processors. The dead ``article_item = JobBoleArticleItem()``
    instantiation and the commented-out CSS extraction walkthrough were
    removed; comments are in English.
    """
    front_image_url = response.meta.get("front_image_url", "")  # cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    # The loaded item flows on to the pipelines.
    yield item_loader.load_item()
def parse_detail(self, response):
    """Parse an article detail page with the custom ArticleItemLoader.

    Fixes: ``front_image_url`` is now wrapped in a list, matching the
    other spiders and the list-of-URLs contract of image pipelines (it
    was previously passed as a bare string); the dead
    ``article_item = JobBoleArticleItem()`` instantiation and the
    commented-out manual extraction code were removed.
    """
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_css('create_date', ".entry-meta-hide-on-mobile::text")
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('fav_nums', '.post-adds span:nth-child(2)::text')
    item_loader.add_css('comment_nums', '.post-adds a span::text')
    item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', '.hentry')
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    # Wrap in a list: image pipelines expect an iterable of URLs.
    item_loader.add_value(
        "front_image_url", [response.meta.get('front_image_url', '')]
    )
    # load_item() returns each field processed through the loader rules.
    yield item_loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article page into a JobBoleArticleItem.

    The loader performs ``.extract()`` and the per-field cleaning (regex
    number extraction, date parsing, tag filtering) declared in items.py.
    The dead ``article_item = JobBoleArticleItem()`` instantiation and
    the two large commented-out extraction walkthroughs were removed;
    comments are translated to English.
    """
    front_image_url = response.meta.get("front_image_url", "")  # cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # add_css / add_value / add_xpath all feed the loader; no manual extract().
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_css("createdate", ".entry-meta-hide-on-mobile::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", common.get_md5(response.url))
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("thumbs", "span[class*='vote-post-up'] h10::text")
    item_loader.add_css("bookmark", "span[class*='bookmark-btn']::text")
    item_loader.add_css("comments", "div.post-adds a span::text")
    item_loader.add_css("contents", "div.entry")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("author", ".copyright-area > a::text")
    yield item_loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article page and yield a JobboleArticleItem.

    All field rules are registered on an ArticleItemLoader and resolved
    together by ``load_item()``.

    Cleanup: removed the unused ``JobboleArticleItem()`` pre-instantiation
    and the commented-out manual-extraction code.
    """
    # Cover image URL forwarded from the listing page via meta.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('fav_nums', '.bookmark-btn::text ')
    item_loader.add_css('comment_nums', "a[href='#article-comment'] span::text")
    item_loader.add_css('content', 'div.entry')
    item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_value('url', response.url)
    item_loader.add_value('front_image_url', [front_image_url])

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article page into a JobBoleArticleItem via XPath loader rules.

    Cleanup: removed the unused ``JobBoleArticleItem()`` pre-instantiation and
    the commented-out manual-extraction block; the active loader logic is
    unchanged.
    """
    # Cover image URL forwarded from the listing callback via meta.
    front_image_url = response.meta.get('front_image_url', '')

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_xpath('create_date',
                          '//p[@class="entry-meta-hide-on-mobile"]/text()')
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_xpath('praise_nums',
                          '//span[contains(@class, "vote-post-up")]/h10/text()')
    item_loader.add_xpath('comment_nums',
                          '//a[@href="#article-comment"]/span/text()')
    item_loader.add_xpath('fav_nums',
                          '//span[contains(@class, "bookmark-btn")]/text()')
    item_loader.add_xpath('tags',
                          '//p[@class="entry-meta-hide-on-mobile"]/a/text()')
    item_loader.add_xpath('content', '//div[@class="entry"]')

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Populate a JobBoleArticleItem through CSS/value loader rules and yield it."""
    # Cover image URL handed over from the listing callback via meta.
    cover_url = response.meta.get("front_image_url", "")

    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)

    # Values known without touching the page body.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover_url])

    # CSS-selected fields; the loader extracts them when load_item() runs.
    loader.add_css("title", ".entry-header h1::text")
    loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    loader.add_css("praise_nums", ".vote-post-up h10::text")
    loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    loader.add_css("fav_nums", ".bookmark-btn::text")
    loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    loader.add_css("content", "div.entry")

    # Resolve every rule into a filled item and hand it to the pipeline.
    yield loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article page into a JobBoleArticleItem via ItemLoader."""
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    page_url = response.meta.get("page_url", "")  # listing page the article came from

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("create_date",
                          '//p[@class="entry-meta-hide-on-mobile"]/text()')
    item_loader.add_value("front_image_url", [front_image_url])
    # BUG FIX: page_url was pulled from meta but then ignored — a literal "-"
    # was stored in the item instead. Pass the real value through.
    item_loader.add_value("page_url", page_url)
    item_loader.add_xpath("comment_nums",
                          '//a[@href="#article-comment"]/span/text()')
    item_loader.add_xpath(
        "praise_nums",
        '//span[@class=" btn-bluet-bigger href-style vote-post-up register-user-only "]/h10/text()')
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    item_loader.add_xpath("content", '//div[@class="entry"]/text()')

    article_item = item_loader.load_item()
    yield article_item
def parse_details(self, response):
    """Extract article fields into a JobBoleArticleIterm via ItemLoader.

    Cleanup: removed a stray debug ``print(response.meta)`` (console noise on
    every parsed page — use spider logging instead if needed), the unused
    ``metas`` local, the unused pre-instantiated item, and dead commented-out
    extraction code.
    """
    article_url = response.url
    article_url_id = strURL_to_md5(article_url)
    # Cover image URL forwarded via meta; .get() defaults to '' instead of
    # raising when the key is absent.
    cover_img_url = response.meta.get('cover_img_url', '')

    iterm_loader = ArticleItemLoader(item=JobBoleArticleIterm(), response=response)
    iterm_loader.add_css('title', '.entry-header h1::text')
    iterm_loader.add_value('article_url', article_url)
    iterm_loader.add_value('article_url_id', article_url_id)
    iterm_loader.add_css('published_date', '.entry-meta-hide-on-mobile::text')
    iterm_loader.add_value('cover_image_url', [cover_img_url])
    iterm_loader.add_css('like_nums', '.vote-post-up h10::text')
    iterm_loader.add_css('collect_nums', '.bookmark-btn::text')
    iterm_loader.add_css('comment_nums', '.post-adds a span::text')
    iterm_loader.add_css('content', '.entry')
    iterm_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')

    article_iterm = iterm_loader.load_item()
    yield article_iterm
def parse_detail(self, response):
    """Parse a JobBole article page and yield a JobBoleArticleItem.

    ArticleItemLoader (an ItemLoader subclass introduced so each field's
    collected list is reduced to its first element automatically) gathers
    every rule added below and resolves them when ``load_item()`` is called.
    Selector rules can be loaded from configuration instead of being
    hard-coded, which is why the loader style is preferred here.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # CSS rules: item field name first, CSS selector second.
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_css("datetime", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_css("like_num", ".vote-post-up h10::text")
    item_loader.add_css("collect_num", ".bookmark-btn::text")
    # BUG FIX: the "context" rule was registered twice, so the article body
    # was collected into the field twice; keep a single rule.
    item_loader.add_css("context", ".entry")
    # Values not taken from the page via a selector are filled in directly.
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_value("front_image_url", [front_image_url])

    # load_item() resolves all registered rules into the populated item.
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Build a JobboleArticleItem from the article page via loader rules."""
    loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)

    # Directly-known values.
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_value('front_image_url', [response.meta.get("front_image_url", "")])

    # XPath-selected fields.
    loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    loader.add_xpath('create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')
    loader.add_xpath('comment_nums', '//a[@href="#article-comment"]/span/text()')

    # The favourites counter may be absent from the page; fall back to '0'
    # so the field is always populated.
    fav_nums = response.xpath(
        '//div[@class="post-adds"]/span[2]/h10/text()').extract_first()
    loader.add_value('fav_nums', fav_nums if fav_nums is not None else '0')

    loader.add_xpath('tags', '//div[@class="entry-meta"]/p/a/text()')
    loader.add_xpath('content', '//div[@class="entry"]')

    yield loader.load_item()
def parse_question(self, response): # 处理question页面, 从页面中提取出具体的question item question_id = response.meta.get('question_id', '') item_loader = ArticleItemLoader(item=ZhihuQuestionItem(), response=response) item_loader.add_css("title", "h1.QuestionHeader-title::text") item_loader.add_css("content", ".QuestionHeader-detail") item_loader.add_value("url", response.url) item_loader.add_value("zhihu_id", question_id) item_loader.add_css("answer_num", ".List-headerText span::text") item_loader.add_css("comments_num", ".QuestionHeader-Comment span::text") item_loader.add_css( "watch_user_num", ".NumberBoard:first-child .NumberBoard-itemValue::text") item_loader.add_css( "click_num", ".NumberBoard:last-child .NumberBoard-itemValue::text") item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text") question_item = item_loader.load_item() # 请求答案api answer_url = self.start_answer_url.format(question_id, 5, 0) yield scrapy.Request(answer_url, headers=self.headers, callback=self.parse_answer) yield question_item
def parse_detail(self, response): sohuItem = SohuItem() front_image_url = response.meta.get("front_image_url", "") article_type = response.meta.get("article_type", "") item_loader = ArticleItemLoader(item=sohuItem, response=response) item_loader.add_css("title", ".text-title h1::text") item_loader.add_value("url", response.url) item_loader.add_value("front_image_url", front_image_url) item_loader.add_value("url_object_id", get_md5(response.url)) item_loader.add_value("article_type", article_type) item_loader.add_css("author_name", ".user-info h4 a::text") item_loader.add_css("publish_time", ".article-info span::text") item_loader.add_css("content", "article") item_loader.add_value("crawl_time", datetime.now()) sohuItem = item_loader.load_item() yield sohuItem
def parse_detail(self, response): # 实例化item article_item = JobBoleArticleItem() # jobbole.py 解析字段,使用选择器 # 首先需要实例化一个ItemLoader类的对象 item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response) # 实例化一个对象 """有三种重要的方法 item_loader.add_css() # 通过css选择器选择的 item_loader.add_xpath() item_loader.add_value() # 不是选择器选择的,而是直接填充 """ # 通过item loader加载item # 获得meta中的front_image_url,文章封面图 front_image_url = response.meta.get("front_image_url", "") item_loader.add_css("title", ".entry-header h1::text") item_loader.add_value("url", response.url) item_loader.add_value("url_object_id", get_md5(response.url)) item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text") item_loader.add_value("front_image_url", [front_image_url]) item_loader.add_css("praise_nums", ".vote-post-up h10::text") item_loader.add_css("fav_nums", ".bookmark-btn::text") item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text") item_loader.add_css("content", "div.entry") item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text") # 获取article_item article_item = item_loader.load_item() """ 调用默认的load_item()方法有两个问题,第一个问题会将所有的值变成一个list,虽然听起来不合理,但是从另外的角度来看,也是合理的 因为通过css选择器取出来的极有可能就是一个list,不管是取第0个还是第1个,都是一个list,所以默认情况就是list 如何解决问题呢,list里面只取第一个,以及对某个字段的list加一些额外的处理过程 在item.py对字段进行定义,scrapy.Field()里面是有参数的,input_processor表示对输入的值预处理过程,后面MapCompose()类中可以传递很多函数名的参数,表示从左到右依次处理 title = scrapy.Field( input_processor = MapCompose(add_jobbole) ) """ yield article_item # 将item传递到pipeline中
def parse_defail(self, response): # 提取文章的具体字段 # 通过item_loader加载item item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response) item_loader.add_css('title', '.entry-header h1::text') item_loader.add_value('url', response.url) item_loader.add_value('url_object_id', get_md5(response.url)) item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text') item_loader.add_value('front_image_url', [response.meta.get('front_image_url', '')]) item_loader.add_css('praise_nums', '.vote-post-up h10::text') item_loader.add_css('comment_nums', 'a[href="#article-comment"] span::text') item_loader.add_css('fav_nums', '.bookmark-btn::text') item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text') item_loader.add_css('content', 'div.entry') item_loader.add_css('author', 'div.copyright-area') article_item = item_loader.load_item() yield article_item
def parse_detail(self, response):
    """Load a JobBoleArticleItem from the article page and send it downstream."""
    # Article cover image URL, forwarded from the listing page via meta.
    cover_url = response.meta.get("front-img-url", "")

    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    loader.add_xpath("creat_date",
                     "//p[@class='entry-meta-hide-on-mobile']/text()")
    loader.add_xpath("praise_num",
                     "//span[contains(@class, 'vote-post-up')]/h10/text()")
    loader.add_xpath("collect_num",
                     "//span[contains(@class, 'bookmark-btn')]/text()")
    loader.add_css("comment_num", ".btn-bluet-bigger.href-style.hide-on-480::text")
    loader.add_xpath("content", "//div[@class='entry']")
    loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover_url])

    # Resolve the registered rules and pass the item to the pipeline.
    yield loader.load_item()
def parse_detail(self, response):
    """Assemble a JobboleArticleItem from the article page via loader rules."""
    cover_url = response.meta.get("front_image_url", "")  # article cover image

    loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)

    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_value('front_image_url', [cover_url])

    loader.add_xpath('title', "//div[@class='entry-header']/h1/text()")
    loader.add_xpath('create_date',
                     "//p[@class='entry-meta-hide-on-mobile']/text()")
    loader.add_xpath('praise_nums',
                     "//span[contains(@class,'vote-post-up')]/h10/text()")
    loader.add_xpath('comment_nums',
                     "//a[@href='#article-comment']/span/text()")
    loader.add_xpath('fav_nums',
                     "//span[contains(@class,'bookmark-btn')]/text()")
    loader.add_xpath('tags',
                     "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    # The article body ("content") is deliberately not collected here.

    yield loader.load_item()
def parse_detail(self, response):
    """Extract an article into a JobBoleAritcleItem via ItemLoader rules.

    Cleanup: removed the unused ``JobBoleAritcleItem()`` pre-instantiation
    (the item is produced by ``load_item()``) and the large commented-out
    xpath/css extraction blocks.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image

    item_loader = ArticleItemLoader(item=JobBoleAritcleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('fav_nums', 'span.bookmark-btn::text')
    item_loader.add_css('comment_nums', 'span.hide-on-480::text')
    item_loader.add_css('content', 'div.entry')

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse one jobbole article page and yield a populated item.

    Uses ArticleItemLoader so per-field cleaning (date parsing, number
    extraction, tag filtering) is applied by the item's processors
    rather than inline here.

    :param response: scrapy Response for the article detail page;
        ``response.meta["front_image_url"]`` carries the cover-image URL
        forwarded from the listing-page request.
    :yields: a loaded ``JobboleArticleItem``.
    """
    # Cover image URL forwarded from the listing page; .get() avoids a
    # KeyError when the meta entry is absent.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css('title', ".entry-header h1::text")
    # Stable fixed-length key derived from the URL (used as primary key downstream).
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_value('url', response.url)
    item_loader.add_css('create_date', "p.entry-meta-hide-on-mobile::text")
    # Image pipelines expect a *list* of URLs, hence the wrapping.
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_css('praise_nums', ".vote-post-up h10::text")
    item_loader.add_css('comment_nums', "a[href='#article-comment'] span::text")
    item_loader.add_css('fav_nums', ".bookmark-btn::text")
    item_loader.add_css('tags', "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css('content', "div.entry")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse one jobbole article page and yield a populated item.

    :param response: scrapy Response for the article detail page;
        ``response.meta["front_image_url"]`` carries the cover-image URL
        forwarded from the listing-page request.
    :yields: a loaded ``JobBoleArticleItem``.
    """
    # Cover image URL forwarded from the listing page via request meta.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    # Stable fixed-length key derived from the URL (used as primary key downstream).
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # BUG FIX: image pipelines expect a *list* of URLs — passing a bare
    # string made downstream code iterate it character by character.
    # Wrapped in a list, consistent with the sibling parse_detail methods.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    # load_item() applies the loader's input/output processors and
    # builds the final item.
    article_item = item_loader.load_item()
    yield article_item
def pares_detail(self, response):
    """Parse one jobbole article page and yield a populated item.

    NOTE(review): the method name "pares_detail" is a typo for
    "parse_detail"; it is kept unchanged because the spider's request
    callbacks reference it by this name.

    :param response: scrapy Response for the article detail page;
        ``response.meta["front_image_url"]`` carries the cover-image URL
        forwarded from the listing-page request.
    :yields: a loaded ``JobBoleArticleItem``.
    """
    # Cover image URL forwarded from the listing page via request meta.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    # BUG FIX: the original stored the literal string "response.url"
    # instead of the actual page URL.
    item_loader.add_value("url", response.url)
    # Stable fixed-length key derived from the URL (used as primary key downstream).
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Image pipelines expect a *list* of URLs, hence the wrapping.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css('praise_nums', ".vote-post-up h10::text")
    item_loader.add_css('comment_nums', "a[href='#article-comment'] span::text")
    item_loader.add_css('fav_nums', "span.bookmark-btn::text")
    item_loader.add_css('tags', "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css('content', 'div.entry')

    article_item = item_loader.load_item()
    yield article_item