def parse_content(response):
    """Populate a LagouJobItem from a Lagou job-detail page and return it."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

    # Values derived directly from the response object.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("crawl_time", datetime.now())

    # CSS-selected fields.
    for field, selector in (
        ("title", ".job-name::attr(title)"),
        ("salary_min", ".job_request .salary::text"),
        ("tags", ".position-label li::text"),
        ("publish_time", ".publish_time::text"),
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div"),
        ("job_addr", ".work_addr"),
        ("company_name", "#job_company dt a img::attr(alt)"),
        ("company_url", "#job_company dt a::attr(href)"),
    ):
        loader.add_css(field, selector)

    # Positional spans inside the job_request block have no CSS hook; use XPath.
    for field, xpath in (
        ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
        ("work_years_min", "//*[@class='job_request']/p/span[3]/text()"),
        ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
        ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
    ):
        loader.add_xpath(field, xpath)

    return loader.load_item()
def parse_content(response):
    """Build a JobboleBlogItem from an article page and yield it to the pipeline."""
    # The cover-image URL is forwarded from the listing page via request meta.
    cover_url = response.meta.get("front_image_url", "")

    loader = JobboleBlogItemLoader(item=JobboleBlogItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover_url])

    # Each field is extracted by the CSS rule registered here; the loader's
    # processors run when load_item() is called.
    for field, selector in (
        ("title", ".entry-header h1::text"),
        ("create_date", "p.entry-meta-hide-on-mobile::text"),
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    ):
        loader.add_css(field, selector)

    # load_item() applies the configured rules and fills the item, which is
    # then handed to the pipeline.
    yield loader.load_item()
def parse_question(self, response):
    """Extract a ZhihuQuestionItem from a question page, then request its answers.

    Handles both the new-style page (detected via the ``QuestionHeader-title``
    CSS class in the response body) and the legacy layout (kept as a safety
    net; the old pages are probably no longer served).  Yields one Request
    against the answer API followed by the populated question item.
    """
    # Raw string so \d is a real regex escape ("\d" in a plain string is a
    # SyntaxWarning on modern Python).  The match was previously duplicated
    # verbatim in both branches; hoist it so it runs once.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        # Not a question URL.  Previously execution fell through to the
        # trailing yields with question_id/question_item unbound (NameError);
        # bail out explicitly instead.
        return

    question_id = int(match_obj.group(2))
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)

    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("question_id", question_id)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_xpath(
            "content",
            "//*[@id='root']/div/main/div/div[1]/div[2]"
            "/div[1]/div[1]/div[2]/div/div/div/span/text()",
        )
        item_loader.add_css(
            "topics", ".QuestionHeader-topics .Tag.QuestionTopic .Popover div::text"
        )
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        # watch_user_num contains both the watcher and the click counts; they
        # are separated later during data cleaning.
        item_loader.add_css("watch_user_num", ".NumberBoard-itemValue ::text")
        item_loader.add_value("url", response.url)
    else:
        # Legacy page layout.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()",
        )
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        # NOTE(review): this branch fills "zhihu_id" while the new-layout
        # branch fills "question_id" -- confirm which field the item actually
        # declares; one of the two is likely dead.
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css(
            "comments_num", "#zh-question-meta-wrap a[name='addcomment']::text"
        )
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|"
            "//*[@class='zh-question-followers-sidebar']/div/a/strong/text()",
        )
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

    question_item = item_loader.load_item()

    # Kick off the request against the backend answer API for this question.
    yield scrapy.Request(
        self.start_answer_url.format(question_id, 20, 0),
        headers=self.headers,
        callback=self.parse_answer,
    )
    yield question_item
def parse_content(response):
    """Populate a ZhilianItem from a Zhilian job-detail page and return it."""
    loader = ZhilianItemLoader(item=ZhilianItem(), response=response)

    # Values derived directly from the response object.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("crawl_time", datetime.now())

    # CSS-selected fields.
    for field, selector in (
        ("title", "div.main1-stat h1.info-h3::text"),
        ("salary_min", "div.main1-stat div.info-money>strong::text"),
        ("job_city", "div.main1-stat .info-three>span:nth-child(1)>a::text"),
        ("work_years_min", "div.main1-stat .info-three>span:nth-child(2)::text"),
        ("degree_need", "div.main1-stat .info-three>span:nth-child(3)::text"),
        ("job_desc", "div.responsibility > div.pos-ul"),
        ("job_addr", "div.work-add > p.add-txt::text"),
        ("company_name", "div.company > a::text"),
        ("company_url", "div.company > a::attr(href)"),
    ):
        loader.add_css(field, selector)

    # job_advantage is pulled from the first inline <script> block; there is
    # no CSS hook for it, hence the XPath rule.
    loader.add_xpath("job_advantage", "//script[1]")

    return loader.load_item()
def parse_content(response):
    """Populate a LagouJobItem (reduced field set) from a detail page and return it."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

    # Values derived directly from the response object.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("crawl_time", datetime.now())

    # CSS-selected fields.
    for field, selector in (
        ("title", ".job-name::attr(title)"),
        ("tags", ".position-label li::text"),
        ("publish_time", ".publish_time::text"),
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div"),
        ("job_addr", ".work_addr"),
        ("company_name", "#job_company dt a img::attr(alt)"),
        ("company_url", "#job_company dt a::attr(href)"),
    ):
        loader.add_css(field, selector)

    return loader.load_item()
def parse_content(response):
    """Populate a W51JobItem from a 51job detail page and return it."""
    loader = W51JobItemLoader(item=W51JobItem(), response=response)

    # Values derived directly from the response object.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("crawl_time", datetime.now())

    # CSS-selected fields.
    for field, selector in (
        ("title", "div.cn > h1::text"),
        ("salary_min", "div.cn > strong::text"),
        ("job_city", "p.msg.ltype::text"),
        ("job_advantage", "div.tHeader.tHjob div.t1>.sp4::text"),
        ("job_desc", "div.tCompany_main div.job_msg"),
        ("company_name", "p.cname>a.catn::attr(title)"),
        ("company_url", "p.cname>a.catn::attr(href)"),
    ):
        loader.add_css(field, selector)

    # The address lives in the second bordered box; there is no stable CSS
    # hook for it, hence the positional XPath.
    loader.add_xpath(
        "job_addr", "//*[@class='tBorderTop_box'][2]/div/p[@class='fp']/text()"
    )

    return loader.load_item()
def parse_answer(self, response):
    """Parse one page of the answer-API JSON and fan out per-answer requests.

    For each answer in the payload, yields a Request for the answer's detail
    page with the already-extracted fields carried in ``meta``; finally
    follows pagination until the API reports the last page.
    """
    payload = json.loads(response.text)
    paging = payload["paging"]

    for answer in payload["data"]:
        author = answer["author"]
        detail_url = "https://www.zhihu.com/question/{0}/answer/{1}".format(
            answer["question"]["id"], answer["id"]
        )
        # Optional fields: author id/name and the excerpt may be absent.
        meta = {
            "url_object_id": get_md5(url=answer["url"]),
            "answer_id": answer["id"],
            "question_id": answer["question"]["id"],
            "author_id": author.get("id"),
            "author_name": author.get("name"),
            "content": answer.get("excerpt", ""),
            "create_time": answer["created_time"],
            "updated_time": answer["updated_time"],
        }
        yield scrapy.Request(
            detail_url,
            headers=self.headers,
            callback=self.parse_answer_end,
            meta=meta,
        )

    # Follow pagination until the API says this was the final page.
    if not paging["is_end"]:
        yield scrapy.Request(
            paging["next"], headers=self.headers, callback=self.parse_answer
        )