Example #1
0
 def parse_comment(self, response):
     """Yield a ``CommentItem`` for every ``<li>`` under ``body`` except the last.

     Each row's ``data-field`` attribute is parsed as JSON to obtain the
     comment id (``spid``) and author (``user_name``). The post id is taken
     from ``response.meta['post_id']``; content and time are read from the
     ``.lzl_content_main`` and ``.lzl_time`` nodes of the row.
     """
     rows = response.xpath('body/li')
     # The final <li> is intentionally left out -- presumably a pager /
     # non-comment row; TODO confirm against the page markup.
     for row in rows[:-1]:
         fields = json.loads(row.attrib['data-field'])
         item = CommentItem()
         item['id'] = fields['spid']
         item['author'] = fields['user_name']
         item['post_id'] = response.meta['post_id']
         content_html = row.css('.lzl_content_main').get()
         item['content'] = helper.parse_content(content_html)
         time_node = row.css('.lzl_time')
         item['time'] = time_node.xpath('./text()').get()
         yield item
Example #2
0
 def parse_comment(self, response):
     """Yield a ``CommentItem`` for every comment in a JSON comment response.

     The body is decoded as UTF-8 JSON and ``data.comment_list`` is expected
     to be a mapping whose values each hold a ``comment_info`` list. For each
     comment the id, author, post id, parsed content and formatted time are
     copied into the item.

     Yields nothing when ``comment_list`` is empty or falsy.
     """
     payload = json.loads(response.body.decode('utf8'))
     comment_list = payload['data']['comment_list']
     # An empty result may arrive as ``[]`` or ``None`` rather than ``{}``;
     # neither has ``.values()``, so stop here instead of raising.
     if not comment_list:
         return
     for value in comment_list.values():
         for comment in value['comment_info']:
             item = CommentItem()
             item['id'] = comment['comment_id']
             item['author'] = comment['username']
             item['post_id'] = comment['post_id']
             item['content'] = helper.parse_content(comment['content'])
             # ``now_time`` is passed to time.localtime, i.e. treated as
             # seconds since the epoch and rendered in the local timezone.
             item['time'] = time.strftime(
                 "%Y-%m-%d %H:%M:%S", time.localtime(comment['now_time']))
             yield item
Example #3
0
 def parse_totalComment(self, response):
     """Yield comment items from the first comment page and requests for the rest.

     ``response.text`` is parsed as JSON; ``data.comment_list`` is expected
     to be a mapping whose values each carry a ``comment_info`` list and a
     ``comment_num`` count. Every comment becomes a ``CommentItem``. For each
     entry with more than one page (10 comments per page), a request for
     pages 2..N is yielded with ``self.parse_comment`` as callback and a copy
     of ``response.meta`` (with ``post_id`` updated) as meta.

     Yields nothing when ``comment_list`` is empty or falsy.
     """
     meta = response.meta.copy()
     comment_list = json.loads(response.text)['data']['comment_list']
     if not comment_list:
         return
     for value in comment_list.values():
         # Track the post id per entry instead of reading it back from the
         # loop variable ``item`` after the loop: that name is unbound when
         # the first entry has no comments and stale (belonging to the
         # previous entry) otherwise.
         post_id = None
         for comment in value['comment_info']:
             item = CommentItem()
             item['id'] = comment['comment_id']
             item['author'] = comment['username']
             post_id = comment['post_id']
             item['post_id'] = post_id
             meta['post_id'] = post_id
             item['content'] = helper.parse_content(comment['content'])
             item['time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(comment['now_time']))
             yield item
         if post_id is None:
             # No comment supplied a post id, so no follow-up URL can be built.
             continue
         comment_pages = ceil(value['comment_num'] / 10.0)
         # Page 1 is the current response; request pages 2..comment_pages.
         for page in range(2, comment_pages + 1):
             url = "https://tieba.baidu.com/p/comment?tid=%d&pid=%d&pn=%d" % (meta['thread_id'], post_id, page)
             yield scrapy.Request(url, callback=self.parse_comment, meta=meta)
Example #4
0
 def parse_comment(self, response):
     """Yield a ``CommentItem`` and a user-page request for every comment.

     ``response.text`` is parsed as JSON; ``data.comment_list`` is expected
     to be a mapping whose values each hold a ``comment_info`` list. After
     each item, a request for the author's home page is yielded with
     ``self.parse_user`` as callback.

     Yields nothing when ``comment_list`` is empty or falsy.
     """
     # ``response.text`` replaces the deprecated ``body_as_unicode()``,
     # which returned the same decoded body.
     comment_list = json.loads(response.text)['data']['comment_list']
     # An empty result may arrive as ``[]`` or ``None`` rather than ``{}``;
     # neither has ``.values()``, so stop here instead of raising.
     if not comment_list:
         return
     for value in comment_list.values():
         for comment in value['comment_info']:
             item = CommentItem()
             item['comment_id'] = comment['comment_id']
             item['author'] = comment['username']
             item['post_id'] = comment['post_id']
             item['content'] = helper.parse_content(comment['content'],
                                                    False)
             # ``now_time`` is passed to time.localtime, i.e. treated as
             # seconds since the epoch and rendered in the local timezone.
             item['time'] = time.strftime(
                 "%Y-%m-%d %H:%M:%S", time.localtime(comment['now_time']))
             item['user_id'] = comment['user_id']
             yield item
             url = 'http://tieba.baidu.com/home/main?un=%s' % comment[
                 'username']
             yield scrapy.Request(url, callback=self.parse_user)
Example #5
0
    def parse_comment(self, response):
        """Yield a ``CommentItem`` for every comment in a JSON comment response.

        The body is decoded as UTF-8 JSON and ``data.comment_list`` is
        expected to be a mapping whose values each hold a ``comment_info``
        list. ``tiebaInfoId`` and ``threadId`` are copied from
        ``response.meta``; the remaining fields come from the comment itself.

        Yields nothing when ``comment_list`` is empty or falsy.
        """
        payload = json.loads(response.body.decode('utf8'))
        comment_list = payload['data']['comment_list']
        # An empty result may arrive as ``[]`` or ``None`` rather than ``{}``;
        # neither has ``.values()``, so stop here instead of raising.
        if not comment_list:
            return

        meta = response.meta

        for value in comment_list.values():
            for comment in value['comment_info']:
                item = CommentItem()
                # Tieba account id of the commenter
                item['tiebaAccountId'] = comment['user_id']
                # Comment content
                item['content'] = helper.parse_content(comment['content'])
                # External id of the comment
                item['outContentId'] = comment['comment_id']
                # Tieba id, i.e. the tieba (forum) name
                item['tiebaInfoId'] = meta['tiebaInfoId']
                # Publish time, rendered in the local timezone
                item['publishTime'] = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(comment['now_time']))
                # Thread id
                item['threadId'] = meta['threadId']
                # Id of the floor (post) this nested comment belongs to
                item['postId'] = comment['post_id']
                yield item