def user_parse(self, response):
    """Parse a user's profile JSON: emit the profile item, then schedule
    requests for the people the user follows and the user's followers."""
    profile = json.loads(response.body)
    token = profile['url_token']

    item = ZhihuItem()
    item['user'] = profile
    yield item

    # Crawl the people this user follows (higher priority than followers).
    if profile['following_count'] > 0:
        yield scrapy.Request(
            self.user_followees_url.format(token),
            headers=self.headers,
            priority=10,
            callback=self.following_parse,
            meta={'children_url_token': token},
        )

    # Crawl this user's followers.
    if profile['follower_count'] > 0:
        yield scrapy.Request(
            self.user_followers_url.format(token),
            headers=self.headers,
            priority=5,
            callback=self.followers_parse,
            meta={'parent_url_token': token},
        )
def parse(self, response):
    """Parse one page of the member-list API and request each member's
    followed-questions page."""
    print(response.url)
    # Decode the JSON payload once and pull out both sections.
    payload = json.loads(response.text)
    data = payload['data']
    is_end = payload['paging']['is_end']

    for entry in data:
        item = ZhihuItem()
        for key in ('id', 'name', 'headline', 'gender', 'url_token'):
            item[key] = entry[key]
        yield scrapy.Request(
            url=self.quesUrl + item['url_token'] + '/following/questions',
            meta={'item': item, 'download_timeout': 10},
            callback=self.getQuesHtml,
        )
def parse_user(self, response):
    """Parse the user-detail API response into a ZhihuItem, then follow
    the first page of the user's followers and followees.

    Bug fix: the API key is ``url_token`` (underscore); the original code
    read ``url-token``, which is never present, so ``.get`` returned None
    and the follow/following URLs were built with ``user=None``.
    """
    items = ZhihuItem()
    user_info = json.loads(response.text)
    if user_info:
        # Copy every field declared on the item that the API provides.
        for field in items.fields:
            if field in user_info.keys():
                items[field] = user_info.get(field)
        yield items
        # Schedule page one (offset 0, 20 per page) of both relation lists.
        yield Request(self.follows_url.format(user=user_info.get('url_token'),
                                              include=self.follows_query,
                                              offset=0, limit=20),
                      callback=self.parse_follower)
        yield Request(self.following_url.format(user=user_info.get('url_token'),
                                                include=self.following_query,
                                                offset=0, limit=20),
                      callback=self.parse_following)
def parse_user(self, response):
    """Build an item from a user-detail API payload and recurse into the
    user's follows/followers lists with this same callback."""
    data = json.loads(response.text)
    item = ZhihuItem()
    # Keep only keys that are declared fields on the item.
    for name in item.fields:
        if name in data:
            item[name] = data[name]
    yield item

    token = data.get('url_token')
    # First page (offset 0, 20 per page) of both relation lists.
    yield Request(self.follows_url.format(user=token,
                                          include=self.follows_query,
                                          offset=0, limit=20),
                  callback=self.parse_user)
    yield Request(self.followers_url.format(user=token,
                                            include=self.followers_query,
                                            offset=0, limit=20),
                  callback=self.parse_user)
def answer_parse(self, response):
    """Parse one page of answer results: emit a level-1 item per answer and,
    for answers with comments, schedule every comment page up front.

    NOTE(review): pagination state (self.anserws_meet_end, self.comment_meet_end,
    self.comment_parmas) is shared instance state mutated across callbacks —
    concurrent responses for different answers would interfere. Verify the
    crawl is effectively serialized before relying on this.
    """
    jsonBody = json.loads(response.body)
    # Stop flag taken from the API's paging info.
    self.anserws_meet_end = jsonBody['paging']['is_end']
    if not self.anserws_meet_end:
        for item in jsonBody['data']:
            pipleitem = ZhihuItem()
            pipleitem['id'] = item['id']
            pipleitem[
                'url'] = 'https://www.zhihu.com/question/{q_id}/answer/{a_id}'.format(
                    q_id=item['question']['id'], a_id=item['id'])
            pipleitem['platform'] = '知乎'
            pipleitem['viewType'] = '问答'
            pipleitem['searchWord'] = response.meta['kw']
            pipleitem['Title'] = item['question']['title']
            pipleitem['crawlTime'] = self.get_localtime()
            pipleitem['publishTime'] = self.get_createtime(
                item['created_time'])
            # level 1 = answer itself; comments get level 2 in comment_parse.
            pipleitem['level'] = 1
            pipleitem['commentID'] = 1
            pipleitem['comment_count'] = item['comment_count']
            pipleitem['like'] = item['voteup_count']
            pipleitem['authorName'] = item['author']['name']
            pipleitem['authorID'] = item['author']['id']
            pipleitem['Content'] = item['excerpt']
            yield pipleitem
            if item['comment_count'] > 0:
                # Walk the comment pages for this answer; comment_parse
                # flips self.comment_meet_end when the API reports the end.
                self.comment_meet_end = False
                self.comment_parmas['offset'] = 0
                while not self.comment_meet_end:
                    paramas = urllib.parse.urlencode(self.comment_parmas)
                    url = 'https://www.zhihu.com/api/v4/answers/{id}/comments?{paramters}'.format(
                        id=item['id'], paramters=paramas)
                    yield Request(url=url,
                                  callback=self.comment_parse,
                                  meta={
                                      'answerid': item['id'],
                                      'kw': response.meta['kw'],
                                      'title': item['question']['title']
                                  })
                    self.comment_parmas['offset'] += self.comment_parmas[
                        'limit']
                    # Wait for the response to land so comment_meet_end is
                    # updated before the next iteration.
                    # --*-- this value must be larger than the download delay --*--
                    time.sleep(5)
    else:
        return
def parse(self, response):
    """Extract comment and visit counters from an article page and emit
    one ZhihuItem.

    The counters live in the page's inline JSON state and are pulled out
    with regexes. Raises IndexError when a counter is absent (layout
    change / anti-bot page) — unchanged from the original behavior.

    Cleanup: removed the large slab of commented-out alternative field
    assignments that shadowed this implementation.
    """
    item = ZhihuItem()
    item['comment_num'] = re.findall(r'"commentCount":(\d+)', response.text)[0]
    item['read_num'] = re.findall(r'"visitCount":(\d+)', response.text)[0]
    item['url'] = response.meta.get('url')
    item['MonitorName'] = response.meta.get('MonitorName')
    # These metrics are not available on this page type; keep the fields
    # present but empty so downstream consumers see a consistent schema.
    item['zhuanfa'] = ''
    item['up_num'] = ''
    print(item['read_num'], item['comment_num'], item['url'])
    yield item
def artical_parse(self, response):
    """Parse a Zhihu column article page into a single level-1 ZhihuItem.

    Returns the populated item (not yielded — Scrapy accepts a returned
    item from a callback).

    Fix: the ``"created"`` regex is now a raw string; the original
    ``'"created":(\\d*)'`` relied on the invalid ``\\d`` escape, which
    raises SyntaxWarning/DeprecationWarning on modern Python.
    """
    pipleitem = ZhihuItem()
    pipleitem['viewType'] = '文章'
    pipleitem['id'] = response.meta['id']
    pipleitem['url'] = response.url
    pipleitem['platform'] = '知乎'
    pipleitem['searchWord'] = response.meta['kw']
    pipleitem['Title'] = response.css('.Post-Header .Post-Title').xpath(
        'string(.)').extract_first()
    pipleitem['crawlTime'] = self.get_localtime()
    # Publication time is embedded in the page's inline JSON state as
    # epoch seconds.
    created_secs = int(
        re.findall(r'"created":(\d*)', response.body.decode())[0])
    pipleitem['publishTime'] = self.get_createtime(secs=created_secs)
    pipleitem['level'] = 1
    pipleitem['authorName'] = response.css(
        '.AuthorInfo-name .UserLink-link').xpath('text()').extract_first()
    # authorID is the author's profile href, not a numeric id.
    pipleitem['authorID'] = response.css(
        '.AuthorInfo-name .UserLink-link').xpath('@href').extract_first()
    pipleitem['commentID'] = 1
    pipleitem['Content'] = response.css('#root .Post-RichText').xpath(
        'string(.)').extract_first()
    return pipleitem
def parse_user(self, response):
    """Turn the user-detail API response into an item, then fan out to the
    user's follows and followers APIs."""
    profile = json.loads(response.text)

    item = ZhihuItem()
    # Populate only the fields declared on the item class.
    for key in item.fields:
        if key in profile:
            item[key] = profile[key]
    yield item

    token = profile.get('url_token')
    # Request the first page of people this user follows.
    yield Request(
        self.follows_url.format(user=token,
                                include=self.follow_include,
                                limit=20,
                                offset=0), self.parse_follows)
    # Request the first page of this user's followers.
    yield Request(
        self.followers_url.format(user=token,
                                  include=self.followers_include,
                                  limit=20,
                                  offset=0), self.parse_followers)
def comment_parse(self, response):
    """Parse one page of an answer's comments into level-2 items.

    Also flips self.comment_meet_end, which answer_parse's request loop
    polls to stop paging — see the NOTE there about shared state.
    """
    jsonBody = json.loads(response.body)
    # Stop flag read by answer_parse's while-loop.
    self.comment_meet_end = jsonBody['paging']['is_end']
    if not self.comment_meet_end:
        for item in jsonBody['data']:
            pipleitem = ZhihuItem()
            # 'id' is the parent answer's id; 'commentID' is this comment's.
            pipleitem['id'] = response.meta['answerid']
            pipleitem['commentID'] = item['id']
            pipleitem['url'] = item['url']
            pipleitem['platform'] = '知乎'
            pipleitem['viewType'] = '问答'
            pipleitem['searchWord'] = response.meta['kw']
            pipleitem['Title'] = response.meta['title']
            pipleitem['crawlTime'] = self.get_localtime()
            pipleitem['publishTime'] = self.get_createtime(
                item['created_time'])
            # level 2 = comment under an answer (answers are level 1).
            pipleitem['level'] = 2
            pipleitem['like'] = item['vote_count']
            pipleitem['authorName'] = item['author']['member']['name']
            pipleitem['authorID'] = item['author']['member']['id']
            pipleitem['Content'] = item['content']
            yield pipleitem
    return
def parse_detail(self, response):
    """Scrape a user's profile page (HTML) into a ZhihuItem."""
    # Labels shown in the profile header (e.g. "所在行业" / "个人简介").
    list_Label = response.css(".ProfileHeader-detailLabel::text").extract()
    # Industry and personal bio share one value selector, so their count
    # decides which is which.
    business_introduce = response.css(".ProfileHeader-detailValue::text").extract()
    introduce = ''
    if len(business_introduce) == 2:
        business = business_introduce[0]
        introduce = business_introduce[1]
        list_Label.remove('所在行业')
        list_Label.remove('个人简介')
    elif len(business_introduce) == 1:
        # Only one value present: use the labels to decide which field it is.
        # NOTE(review): list_Label is mutated while being iterated, and
        # 'business' stays unbound if neither label matches — fragile;
        # confirm the page always carries one of these labels.
        for i in list_Label:
            if i == '所在行业':
                business = business_introduce[0]
                introduce = ''
                list_Label.remove('所在行业')
            elif i == '个人简介':
                business = ''
                introduce = business_introduce[0]
                list_Label.remove('个人简介')
                break
    else:
        business = ''
        introduce = ''
    # Current residence.
    place = response.css("div.ProfileHeader-detailValue span:nth-child(1)::text").extract()
    list_data = response.css(
        "div.ProfileHeader-detailValue div.ProfileHeader-field:nth-child(1)::text").extract()
    # Job title.
    job = ''
    # School / education.
    edu = ''
    if len(list_data) == 2:
        job = list_data[0]
        edu = list_data[1]
    elif len(list_data) == 1:
        # Single value: the remaining labels tell us whether it is the job
        # ("职业经历") or the education ("教育经历") entry.
        for i in list_Label:
            if i == '职业经历':
                job = list_data[0]
            elif i == '教育经历':
                edu = list_data[0]
    elif len(list_data) == 0:
        job = ''
        edu = ''
    else:
        print("异常")
    # Clean the residence text: join the fragments and strip the "现居"
    # ("currently living in") prefix.
    if place:
        place = "".join(place)
        place = place.replace("现居", "")
    else:
        place = "".join(place)
    # Gender encoded as 0 = male, 1 = female, -1 = unknown.
    gender = response.css("meta[itemprop*='gender']::attr(content)").extract()
    if gender:
        if gender[0] == 'Male':
            gender = 0  # 0 -> male
        else:
            gender = 1  # 1 -> female
    else:
        gender = -1  # -1 -> unknown
    # Display name.
    u_name = response.css("span.ProfileHeader-name::text").extract_first()
    # Answer count.
    answerCount = response.css("li[aria-controls*='Profile-answers'] a span::text").extract_first()
    # Follower count.
    followerCount = response.css("meta[itemprop*='followerCount']::attr(content)").extract_first()
    # Question count.
    asks = response.css("li[aria-controls*='Profile-asks'] a span::text").extract_first()
    # Following count is scraped from the raw body with a regex.
    # NOTE(review): re.match on str(response.body) — the b'...' repr and a
    # greedy pattern make this brittle; verify it still matches the markup.
    content = str(response.body)
    re_match = re.match(".*?<.*?title=\"(\d*)\">.*</strong>", content)
    if re_match:
        follwingCount = re_match.group(1)
    else:
        follwingCount = 0
    url_token = response.meta['url_token']
    # Instantiate the ZhihuItem and populate it field by field.
    Zhihu_item = ZhihuItem()
    Zhihu_item["u_name"] = u_name  # 1 user name
    Zhihu_item["follwingCount"] = follwingCount  # 2 number of people the user follows
    Zhihu_item["followerCount"] = followerCount  # 3 number of followers
    Zhihu_item["gender"] = gender  # 4 gender
    Zhihu_item["edu"] = edu  # 5 education history
    Zhihu_item["job"] = job  # 6 job history
    Zhihu_item["place"] = place  # 7 residence
    Zhihu_item["url_token"] = [url_token]  # 8 url token (keyword)
    Zhihu_item["asks"] = asks  # 9 question count
    Zhihu_item["answerCount"] = answerCount  # 10 answer count
    Zhihu_item["introduce"] = introduce  # 11 personal bio
    yield Zhihu_item