def parse_comment(self, response):
    """Yield one CommentItem per sub-comment ``<li>`` node of the response.

    All ``body/li`` nodes except the final one are converted to items.
    NOTE(review): the last node is skipped on purpose by the original
    logic -- presumably it is a pager/footer entry, confirm against the
    page markup.

    The comment id and author are read from the JSON held in each node's
    ``data-field`` attribute; the post id comes from
    ``response.meta['post_id']``.
    """
    nodes = response.xpath('body/li')
    # Slice off the trailing node instead of indexing up to len - 1.
    for node in nodes[:-1]:
        fields = json.loads(node.attrib['data-field'])

        item = CommentItem()
        item['id'] = fields['spid']
        item['author'] = fields['user_name']
        item['post_id'] = response.meta['post_id']

        raw_content = node.css('.lzl_content_main').get()
        item['content'] = helper.parse_content(raw_content)

        time_node = node.css('.lzl_time')
        item['time'] = time_node.xpath('./text()').get()
        yield item
def parse_comment(self, response):
    """Yield a CommentItem for every sub-comment in a JSON comment response.

    The response body is UTF-8 JSON whose ``data.comment_list`` maps to
    entries that each carry a ``comment_info`` list of comment dicts.

    An empty ``comment_list`` yields nothing. NOTE(review): the guard also
    covers an empty payload that is not a dict (e.g. ``[]``), which would
    otherwise fail on ``.values()`` -- confirm the exact shape the API
    returns when there are no comments.
    """
    payload = json.loads(response.body.decode('utf8'))
    comment_list = payload['data']['comment_list']
    if not comment_list:
        return

    for value in comment_list.values():
        for comment in value['comment_info']:
            item = CommentItem()
            item['id'] = comment['comment_id']
            item['author'] = comment['username']
            item['post_id'] = comment['post_id']
            item['content'] = helper.parse_content(comment['content'])
            # now_time is formatted in the local timezone of the crawler.
            item['time'] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(comment['now_time']))
            yield item
def parse_totalComment(self, response):
    """Yield first-page sub-comments and requests for the remaining pages.

    ``data.comment_list`` in the JSON response maps to entries that each
    hold a ``comment_info`` list (the comments already included in this
    response) and a ``comment_num`` total. Every comment becomes a
    CommentItem; when ``comment_num`` spans more than one page of 10, a
    request for each further page is yielded with ``self.parse_comment``
    as the callback.

    A copy of ``response.meta`` is forwarded to those requests, with
    ``post_id`` set to the post the comments belong to. ``thread_id`` must
    already be present in ``response.meta``.
    """
    meta = response.meta.copy()
    comment_list = json.loads(response.text)['data']['comment_list']
    if not comment_list:
        return

    for value in comment_list.values():
        # Track the post id per entry so pagination never reuses the id of
        # a previous entry (or an unbound name) when comment_info is empty.
        post_id = None
        for comment in value['comment_info']:
            post_id = comment['post_id']
            item = CommentItem()
            item['id'] = comment['comment_id']
            item['author'] = comment['username']
            item['post_id'] = meta['post_id'] = post_id
            item['content'] = helper.parse_content(comment['content'])
            item['time'] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(comment['now_time']))
            yield item

        if post_id is None:
            # No comment in this entry, so the post id needed for the
            # follow-up URL is unknown; skip pagination for it.
            continue

        comment_pages = ceil(value['comment_num'] / 10.0)
        # Page 1 is the data handled above; request pages 2..comment_pages.
        for page in range(2, comment_pages + 1):
            url = "https://tieba.baidu.com/p/comment?tid=%d&pid=%d&pn=%d" % (
                meta['thread_id'], post_id, page)
            yield scrapy.Request(url, callback=self.parse_comment, meta=meta)
def parse_comment(self, response):
    """Yield a CommentItem and a user-page request for every sub-comment.

    ``data.comment_list`` in the JSON response maps to entries that each
    carry a ``comment_info`` list of comment dicts. For every comment an
    item is yielded, followed by a request for the author's home page
    handled by ``self.parse_user``.

    The decoded body is read through ``response.text`` (the replacement
    for the deprecated ``body_as_unicode()``). An empty ``comment_list``
    yields nothing. NOTE(review): the guard also covers an empty payload
    that is not a dict (e.g. ``[]``) -- confirm the shape the API returns
    when there are no comments.
    """
    comment_list = json.loads(response.text)['data']['comment_list']
    if not comment_list:
        return

    for value in comment_list.values():
        for comment in value['comment_info']:
            item = CommentItem()
            item['comment_id'] = comment['comment_id']
            item['author'] = comment['username']
            item['post_id'] = comment['post_id']
            item['content'] = helper.parse_content(comment['content'], False)
            # now_time is formatted in the local timezone of the crawler.
            item['time'] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(comment['now_time']))
            item['user_id'] = comment['user_id']
            yield item

            # Follow up with the commenter's profile page.
            url = 'http://tieba.baidu.com/home/main?un=%s' % comment['username']
            yield scrapy.Request(url, callback=self.parse_user)
def parse_comment(self, response):
    """Yield a CommentItem for every sub-comment in a JSON comment response.

    The response body is UTF-8 JSON whose ``data.comment_list`` maps to
    entries that each carry a ``comment_info`` list of comment dicts.
    ``tiebaInfoId`` and ``threadId`` are copied from ``response.meta``
    onto each item.

    An empty ``comment_list`` yields nothing. NOTE(review): the guard also
    covers an empty payload that is not a dict (e.g. ``[]``) -- confirm the
    shape the API returns when there are no comments.

    NOTE: a running total of sub-comments carried through
    ``meta['total_commont_floor_num']`` and the ``author`` field were
    previously present here as commented-out code; neither is populated.
    """
    payload = json.loads(response.body.decode('utf8'))
    comment_list = payload['data']['comment_list']
    if not comment_list:
        return

    meta = response.meta
    for value in comment_list.values():
        for comment in value['comment_info']:
            item = CommentItem()
            # Tieba account id of the commenter.
            item['tiebaAccountId'] = comment['user_id']
            # Comment content.
            item['content'] = helper.parse_content(comment['content'])
            # External id of this comment.
            item['outContentId'] = comment['comment_id']
            # Tieba id (i.e. the tieba's name).
            item['tiebaInfoId'] = meta['tiebaInfoId']
            # Publish time, formatted in the crawler's local timezone.
            item['publishTime'] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(comment['now_time']))
            # Thread id.
            item['threadId'] = meta['threadId']
            # Id of the floor (post) this sub-comment belongs to.
            item['postId'] = comment['post_id']
            yield item