def parse_comment(self, response):
    """Parse one page of the comment API and schedule the page after it.

    Reads ``pid`` and ``page`` from ``response.meta``, decodes the JSON body,
    yields one ``CommentItem`` per entry of ``data.list`` and then, if the
    current page is not the last one, yields a request for the next page
    that is handled by this same callback.

    The previous implementation looped ``while page <= total_page``, which
    requested page ``total_page + 1`` and made every page re-issue requests
    for all later pages. Each response now schedules only its successor.
    """
    pid = response.meta['pid']
    page = response.meta['page']

    data = json.loads(response.text)['data']
    # NOTE(review): assumes total_page is numeric and comparable with the
    # page number carried in meta - confirm against the API payload.
    total_page = data['total_page']

    # Copy every comment returned by the API into its own item.
    for one_data in data['list']:
        item = CommentItem()
        item['commentid'] = one_data['commentid']
        item['pid'] = one_data['articleid']
        item['cid'] = one_data['userInfo']['userid']
        item['created_at'] = one_data['addtime']
        item['content'] = one_data['content']
        # num_to_int is defined elsewhere in the project.
        item['like_counts'] = num_to_int(one_data['count_approve'])
        yield item

    # Follow the chain one page at a time and stop at the last page.
    if page < total_page:
        next_page = page + 1
        request = scrapy.Request(comment_api % (next_page, pid),
                                 callback=self.parse_comment)
        request.meta['page'] = next_page
        request.meta['pid'] = pid
        yield request
def parse_comment(self, response):
    """Convert one page of the JSON comment API into ``CommentItem`` objects.

    Yields one item per entry of ``data.list``. When the payload carries a
    non-empty ``next_page_url``, a request for that URL is yielded as well,
    handled by this same callback.
    """
    # The endpoint is requested directly and answers with JSON.
    payload = json.loads(response.text)['data']

    for entry in payload['list']:
        item = CommentItem()
        item['content'] = entry['content']          # comment text
        item['commentid'] = entry['commentid']      # comment id
        item['pid'] = entry['articleid']            # id of the commented work
        item['created_at'] = entry['addtime']       # publication time
        item['like_counts'] = entry['count_approve']  # number of likes

        # Author of the comment: id, display name and avatar.
        author = entry['userInfo']
        item['cid'] = author['userid']
        item['uname'] = author['username']
        item['avatar'] = author['face']

        # A non-empty ``reply`` means this comment answers another one;
        # store the id of that comment (0 when the id itself is falsy).
        replied_to = entry['reply']
        if replied_to:
            item['reply'] = replied_to['commentid'] or 0

        yield item

    # Keep paging while the API advertises a next page.
    following_url = payload['next_page_url']
    if following_url:
        yield Request(following_url, callback=self.parse_comment)
def parse_comment(self, response):
    """Parse an HTML comment page, yielding comments and follow-up requests.

    Expects ``pid`` and ``cur_page`` in ``response.meta``. Yields:

    * a request for the next comment page when the ``data-totalpages``
      attribute of the last ``<li>`` is a number greater than ``cur_page``;
    * for every ``<li>``, a request for the commenter's profile page
      (handled by ``self.parse_composer``) followed by a ``CommentItem``.

    Fixes over the previous version: ``cid`` is taken from the relative
    profile link rather than from the absolute URL (which produced values
    such as ``"tp://..."``), and a ``<li>`` without a profile link no longer
    triggers a request to ``"<root_url>None"``.
    """
    if not response.text:
        return

    total_pages = response.xpath('//li[last()]/@data-totalpages').get()
    print('-' * 50, total_pages)
    cur_page = response.meta['cur_page']
    pid = response.meta['pid']

    # Schedule the next page only when the total is known and not reached.
    if total_pages and total_pages.isdigit():
        if int(total_pages) > cur_page:
            request = Request(comment_api % (pid, cur_page + 1),
                              callback=self.parse_comment)
            request.meta['pid'] = pid
            request.meta['cur_page'] = cur_page + 1
            yield request

    for comment in response.xpath('//li'):
        c = CommentItem()
        href = comment.xpath('./a[1]/@href').get()
        if href:
            yield Request('%s%s' % (self.root_url, href),
                          callback=self.parse_composer)
            # NOTE(review): assumes the link looks like "/u<id>", so that
            # dropping the first two characters leaves the user id - confirm.
            c['cid'] = href[2:]
        else:
            # No profile link in this entry: nothing to crawl, no id known.
            c['cid'] = None
        c['pid'] = pid
        c['created_at'] = comment.xpath(
            './/span[contains(@class,"send-time")]/text()').get()
        c['content'] = comment.xpath(
            './/div[contains(@class,"comment-con")]/text()').get()
        c['like_counts'] = comment.xpath(
            './/i[@class="counts"]/text()').get()
        yield c
def parse_comment(self, response):
    """Parse the comment API response.

    For each entry of ``data.list`` this yields the populated
    ``CommentItem`` and then a request for the commenter's profile page
    (callback ``self.parse_composer``, with ``cid`` stored in the request
    meta). A non-empty ``next_page_url`` is followed with this same callback.
    """
    payload = json.loads(response.text)['data']
    composer_url = 'http://www.xinpianchang.com/u%s?from=articleList'

    for entry in payload['list']:
        item = CommentItem()
        item['commentid'] = entry['commentid']
        item['pid'] = entry['articleid']
        item['content'] = entry['content']
        item['created_at'] = entry['addtime_int']

        author = entry['userInfo']
        item['cid'] = author['userid']
        item['uname'] = author['username']
        item['avatar'] = author['face']
        item['like_counts'] = entry['count_approve']

        # A non-empty ``reply`` marks this comment as an answer to another
        # comment; keep the id of the comment being answered.
        replied_to = entry['reply']
        if replied_to:
            item['reply'] = replied_to['commentid']
        yield item

        # Also crawl the profile of whoever wrote the comment.
        author_id = item['cid']
        profile_request = Request(composer_url % author_id,
                                  callback=self.parse_composer)
        profile_request.meta['cid'] = author_id
        yield profile_request

    # Continue with the next page when there is one.
    following_url = payload['next_page_url']
    if following_url:
        yield response.follow(following_url, self.parse_comment)
def parse_comment(self, response):
    """Parse a JSON comment page for the work identified by ``meta['pid']``.

    Does nothing for an empty body. Otherwise yields, in order: a request
    for the next page (when ``next_page_url`` is non-empty, carrying ``pid``
    in its meta), then for each comment a ``CommentItem`` followed by a
    request for the commenter's profile page (callback
    ``self.parse_composer``, ``cid`` in meta).
    """
    if not response.text:
        return

    pid = response.meta['pid']
    payload = json.loads(response.text)['data']

    # Queue the next page before emitting this page's comments.
    following_url = payload['next_page_url']
    if following_url:
        page_request = Request(following_url, callback=self.parse_comment)
        page_request.meta['pid'] = pid
        yield page_request

    for entry in payload['list']:
        item = CommentItem()
        item['commentid'] = entry['commentid']
        item['pid'] = pid

        author = entry['userInfo']
        item['cid'] = author['userid']
        item['uname'] = author['username']
        item['avatar'] = author['face']

        item['created_at'] = int(entry['addtime_int'])
        item['content'] = entry['content']
        # ``ci`` is a helper defined elsewhere in the project.
        item['like_counts'] = ci(entry['count_approve'])

        # Record the id of the comment being answered, 0 if that id is falsy.
        replied_to = entry['reply']
        if replied_to:
            item['reply'] = replied_to['commentid'] or 0
        yield item

        author_id = item['cid']
        profile_request = Request('%s/u%s' % (self.root_url, author_id),
                                  callback=self.parse_composer)
        profile_request.meta['cid'] = author_id
        yield profile_request
def parse_comment(self, response):
    """Parse one JSON page of comments belonging to ``meta['pid']``.

    An empty response body yields nothing. Otherwise the generator first
    yields a request for ``next_page_url`` (if non-empty, with ``pid`` in
    its meta) and then, per comment, a ``CommentItem`` and a request for
    the author's profile page handled by ``self.parse_composer``.
    """
    body = response.text
    if not body:
        return

    pid = response.meta['pid']
    data = json.loads(body)['data']

    # Pagination is driven by the URL the API hands back.
    next_url = data['next_page_url']
    if next_url:
        next_request = Request(next_url, callback=self.parse_comment)
        next_request.meta['pid'] = pid
        yield next_request

    for raw in data['list']:
        record = CommentItem()
        record['commentid'] = raw['commentid']
        record['pid'] = pid

        user = raw['userInfo']
        record['cid'] = user['userid']
        record['uname'] = user['username']
        record['avatar'] = user['face']

        record['created_at'] = int(raw['addtime_int'])
        record['content'] = raw['content']
        # ``ci`` is a helper defined elsewhere in the project.
        record['like_counts'] = ci(raw['count_approve'])

        # Replies store the id of the comment they answer (0 if falsy).
        parent = raw['reply']
        if parent:
            record['reply'] = parent['commentid'] or 0
        yield record

        # Crawl the commenter's profile as well.
        user_id = record['cid']
        user_request = Request('%s/u%s' % (self.root_url, user_id),
                               callback=self.parse_composer)
        user_request.meta['cid'] = user_id
        yield user_request
def parse_comment(self, response):
    """Parse the JSON comment API, yielding items and the next-page request.

    When the payload's ``data`` is non-empty, one ``CommentItem`` is yielded
    per entry of ``data.list`` and a non-empty ``next_page_url`` is followed
    with this same callback.

    Fixes over the previous version:

    * the ``response.follow(...)`` request is now yielded; before, it was
      created and discarded, so only the first page was ever crawled;
    * a fresh ``CommentItem`` is built for every comment instead of
      mutating and re-yielding one shared instance.
    """
    data = json.loads(response.text)['data']
    if not data:
        return

    for ct in data['list']:
        comment = CommentItem()
        comment['cid'] = ct['id']
        comment['content'] = ct['content']
        comment['avatar'] = ct['userInfo']['avatar']
        comment['uname'] = ct['userInfo']['username']
        # ``addtime`` is passed to time.localtime, i.e. treated as seconds
        # since the epoch, and rendered in the local timezone.
        comment['add_time'] = time.strftime(
            '%Y-%m-%d %H-%M', time.localtime(ct['addtime']))
        yield comment

    next_url = data['next_page_url']
    if next_url:
        yield response.follow(next_url, self.parse_comment)
def parse_comments(self, response):
    """Handle the comment API response.

    Yields a ``CommentItem`` for every entry of ``data.list`` and, when
    ``next_page_url`` is non-empty, follows it with this same callback.
    """
    payload = json.loads(response.text)['data']

    for entry in payload['list']:
        item = CommentItem()
        item['id'] = entry['id']
        item['content'] = entry['content']
        item['created_at'] = entry['addtime']
        item['pid'] = entry['resource_id']
        item['cid'] = entry['userid']

        author = entry['userInfo']
        item['avatar'] = author['avatar']
        item['uname'] = author['username']

        item['like_counts'] = entry['count_approve']
        item['referid'] = entry['referid']
        yield item

    following_url = payload['next_page_url']
    if following_url:
        yield response.follow(following_url, callback=self.parse_comments)
def parse_comment(self, response):
    """Parse the JSON comment API, yielding items and the next-page request.

    One ``CommentItem`` is yielded per entry of ``data.list``. A non-empty
    ``next_page_url`` (a path, appended to the API host) is followed with
    this same callback.

    Fixes over the previous version:

    * each comment gets its own ``CommentItem``. A single shared instance
      was mutated and re-yielded, so a ``reply`` set for one comment stayed
      on every later comment that was not itself a reply;
    * an empty ``next_page_url`` no longer triggers a request to the bare
      host (the old check was ``is not None``).
    """
    data = json.loads(response.text)['data']

    for i in data['list']:
        comment = CommentItem()
        comment['uname'] = i['userInfo']['username']
        comment['cid'] = i['userInfo']['id']
        comment['avatar'] = i['userInfo']['avatar']
        comment['commentid'] = i['id']
        comment['pid'] = i['resource_id']
        comment['content'] = i['content']
        comment['created_at'] = i['addtime']
        comment['like_counts'] = i['count_approve']
        # ``referid`` is recorded as ``reply`` only when it differs from 0;
        # other falsy values (e.g. None) are stored as 0.
        if i['referid'] != 0:
            comment['reply'] = i['referid'] or 0
        yield comment

    next_page = data['next_page_url']
    if next_page:
        yield response.follow('https://app.xinpianchang.com%s' % next_page,
                              self.parse_comment)
def parse_comment(self, response):
    """Parse the JSON comment API, yielding items and the next-page request.

    One ``CommentItem`` is yielded per entry of ``data.list``; a non-empty
    ``next_page_url`` is followed with this same method as callback.

    Fix over the previous version: the follow-up request now names
    ``self.parse_comment`` as its callback. Without an explicit callback
    Scrapy hands the response to the spider's default ``parse`` method, so
    comments on later pages were never processed here.
    """
    data = json.loads(response.text)['data']

    for c in data['list']:
        comment = CommentItem()
        comment['commentid'] = c['commentid']
        comment['pid'] = c['articleid']
        comment['cid'] = c['userInfo']['userid']
        comment['avatar'] = c['userInfo']['face']
        comment['uname'] = c['userInfo']['username']
        comment['created_at'] = c['addtime']
        comment['content'] = c['content']
        # Strip commas from the like count; this relies on the API
        # delivering ``count_approve`` as a string.
        comment['like_counts'] = c['count_approve'].replace(',', '')
        # A non-empty ``reply`` means this comment answers another one.
        if c['reply']:
            comment['reply'] = c['reply']['commentid']
        yield comment

    next_page = data['next_page_url']
    if next_page:
        yield response.follow(next_page, callback=self.parse_comment)
def parse_comment(self, response):
    """Parse a JSON comment page for the work given in ``meta['pid']``.

    Yields one ``CommentItem`` per entry of ``data.list`` (its ``pid`` is
    taken from the response meta), then a request for ``next_page_url`` if
    that value is non-empty; the request carries ``pid`` forward and uses
    this same callback.
    """
    payload = json.loads(response.text)['data']

    for entry in payload['list']:
        item = CommentItem()
        item['commentid'] = entry['commentid']
        item['pid'] = response.meta['pid']

        author = entry['userInfo']
        item['cid'] = author['userid']
        item['uname'] = author['username']
        item['avatar'] = author['face']

        item['created_at'] = entry['addtime']
        item['content'] = entry['content']
        item['like_counts'] = entry['count_approve']

        # Replies keep the id of the comment they answer.
        replied_to = entry['reply']
        if replied_to:
            item['reply'] = replied_to['commentid']
        yield item

    following_url = payload['next_page_url']
    if following_url:
        page_request = Request(following_url, callback=self.parse_comment)
        page_request.meta['pid'] = response.meta['pid']
        yield page_request
def parse_comment(self, response):
    """Parse the JSON comment API response.

    For each entry of ``data.list`` this yields the populated
    ``CommentItem`` followed by a request for the commenter's profile page
    (callback ``self.parse_composer``, ``cid`` stored in the request meta).
    """
    # Debugging aid: scrapy.shell.inspect_response(response, self) can be
    # called here to drop into a shell, much like a breakpoint.
    entries = json.loads(response.text)['data']['list']
    composer_url = 'http://www.xinpianchang.com/u%s?from=articleList'

    for entry in entries:
        item = CommentItem()
        item['commentid'] = entry['commentid']
        item['pid'] = entry['articleid']
        item['content'] = entry['content']
        item['created_at'] = entry['addtime_int']

        author = entry['userInfo']
        item['cid'] = author['userid']
        item['uname'] = author['username']
        item['avatar'] = author['face']
        item['like_counts'] = entry['count_approve']

        # Replies keep the id of the comment they answer.
        replied_to = entry['reply']
        if replied_to:
            item['reply'] = replied_to['commentid']
        yield item

        author_id = item['cid']
        profile_request = Request(composer_url % author_id,
                                  callback=self.parse_composer)
        profile_request.meta['cid'] = author_id
        yield profile_request
def parse_comment(self, response):
    """Handle the comment API response.

    Yields one ``CommentItem`` per entry of ``data.list`` and follows a
    non-empty ``next_page_url`` with this same callback.
    """
    payload = json.loads(response.text)['data']

    for entry in payload['list']:
        item = CommentItem()
        item['commentid'] = entry['commentid']
        item['pid'] = entry['articleid']
        item['content'] = entry['content']
        item['created_at'] = entry['addtime_int']
        # ``ci`` is a helper defined elsewhere in the project.
        item['like_counts'] = ci(entry['count_approve'])

        author = entry['userInfo']
        item['cid'] = author['userid']
        item['avatar'] = author['face']
        item['uname'] = author['username']

        # A non-empty ``reply`` means this comment answers an earlier one;
        # store the id of the comment being answered.
        replied_to = entry['reply']
        if replied_to:
            item['reply'] = replied_to['commentid']
        yield item

    following_url = payload['next_page_url']
    if following_url:
        yield response.follow(following_url, self.parse_comment)
def parse_comment(self, response):
    """Parse the JSON comment API response.

    Yields one ``CommentItem`` per entry of ``data.list``. When
    ``next_page_url`` is not ``None`` it is appended to the API host, a
    marker line is printed, and the resulting URL is followed with this
    same callback.
    """
    api_host = 'https://app.xinpianchang.com'

    # The comment endpoint answers with JSON, so decode the body first.
    payload = json.loads(response.text)['data']

    for entry in payload['list']:
        item = CommentItem()
        item["commentid"] = entry['id']

        # User details live in a nested dictionary.
        item['cid'] = entry['userInfo']['id']
        item['pid'] = entry['resource_id']
        item["uname"] = entry['userInfo']['username']
        item['avatar'] = entry['userInfo']['avatar']

        item['created_at'] = entry['addtime']
        item['like_counts'] = entry['count_approve']
        item["content"] = entry['content']
        yield item

    next_path = payload['next_page_url']
    if next_path is None:
        return

    next_url = api_host + next_path
    print('-------------------next_page--------------------')
    yield response.follow(next_url, self.parse_comment)