Ejemplo n.º 1
0
 def step1(self, params):
     """Queue the first comment-page URL for ``params.originalurl``.

     Looks up the last stored publish time for the page, extracts a comment
     id from ``params.content`` and, when an id is found, registers the first
     comment page with ``self.storeurl`` for ``STEP_COMMENT_NEXT_PAGE``.
     Nothing is queued when no id can be extracted. Any exception is logged
     through ``Logger.printexception`` and swallowed.
     """
     try:
         # Publish time recorded before this run; forwarded to the next step.
         last_publish = CMTStorage.getlastpublish(params.originalurl)

         # Pick the candidate id keys, tried in order until one yields a value.
         if self.r.search('^http[s]{0,1}://v\.qq\.com/.*',
                          params.originalurl):
             # v.qq.com payload example:
             # {"comment_id":"1167760750","result":{"code":0,"msg":"Success!","ret":0},"srcid":"c0016r7fo07","srcid_type":1001}
             id_keys = ('comment_id', )
         else:
             id_keys = ('cmt_id', 'aid', 'commId')

         url_id = None
         for key in id_keys:
             url_id = self.r.getid(key, params.content)
             if url_id:
                 break

         if not url_id:
             return

         # Build the URL of the first comment page and hand it over.
         first_page_url = self.COMMENTS_URL.format(url_id, 0, self.page_size)
         customized = {
             'url_id': url_id,
             'comment_id': 0,
             'before_update': last_publish
         }
         self.storeurl(first_page_url, params.originalurl,
                       self.STEP_COMMENT_NEXT_PAGE, customized)
     except:
         Logger.printexception()
Ejemplo n.º 2
0
    def step2(self, params):
        """Queue the comment-page URLs announced by the first response.

        Parses ``params.content`` as JSON, compares the reported comment
        total with the number already stored for ``params.originalurl`` and,
        when there are new comments, registers pages 1..N (N capped at
        ``self.maxpages``) with ``self.storeurl`` for ``STEP_3``. The last
        stored publish time is passed along with every queued URL. Any
        exception is logged through ``Logger.printexception`` and swallowed.
        """
        try:
            Logger.getlogging().info("Kr36Comments.STEP_2")
            # cid is handed down from STEP_1 through the customized payload.
            cid = params.customized['cid']

            data = json.loads(params.content)['data']
            total_comments = data['total_items']
            total_pages = data['total_pages']

            # Incremental check: stop when nothing new has been posted.
            if CMTStorage.getcount(params.originalurl) >= total_comments:
                return

            # Never queue more than self.maxpages pages.
            last_page = (self.maxpages
                         if total_pages >= self.maxpages else total_pages)
            last_publish = CMTStorage.getlastpublish(params.originalurl, True)

            for page_no in range(1, last_page + 1):
                page_url = Kr36Comments.COMMENT_URL.format(
                    cid, self.page_size, page_no)
                self.storeurl(page_url, params.originalurl,
                              Kr36Comments.STEP_3, last_publish)
        except:
            Logger.printexception()
Ejemplo n.º 3
0
    def process_book(self, params):
        """Run one step of the 17k.com book-comment crawl.

        Dispatches on ``params.step``:

        * ``Comments.STEP_1`` -- extract the book id from
          ``params.originalurl`` and queue the first comment page.
        * ``Comments.STEP_2`` -- read the totals from the first page and,
          when more comments exist than are already stored, queue pages
          1..N (N capped at ``self.maxpages``).
        * ``Comments.STEP_3`` -- store every comment on the page that is not
          already stored.

        Any exception is printed with ``traceback.print_exc`` and swallowed.
        """
        try:
            if params.step == Comments.STEP_1:
                # Extract the book id from the page URL. The pattern is a raw
                # string and the dot before "html" is escaped so that it only
                # matches a literal '.'.
                bookId = self.r.parse(
                    r'^http://www\.17k\.com/book/(\w+)\.html$',
                    params.originalurl)[0]
                # Build the URL of the first comment page.
                comments_url = Comments.COMMENTS_URL % (bookId, 1,
                                                        Comments.PAGE_SIZE)
                # Ask the download platform to fetch the first comment page.
                self.storeurl(comments_url, params.originalurl,
                              Comments.STEP_2, {'bookId': bookId})

            # First comment page received: queue all comment pages.
            elif params.step == Comments.STEP_2:
                bookId = params.customized['bookId']
                # JSON payload returned by the comment API.
                comments = json.loads(params.content)

                comments_count = int(comments['page']['count'])
                # Incremental check: stop when nothing new has been posted.
                cmtnum = CMTStorage.getcount(params.originalurl)
                if cmtnum >= comments_count:
                    return
                NewsStorage.setcmtnum(params.originalurl, comments_count)
                # NOTE(review): the returned time is not used in this method;
                # the call is kept in case it has side effects -- confirm and
                # remove if it is a pure getter.
                lasttime = CMTStorage.getlastpublish(params.originalurl, True)
                # Number of comment pages reported by the API.
                page_count = int(comments['page']['pagecount'])
                if page_count == 0:
                    return

                # Never queue more than self.maxpages pages.
                page_count = min(page_count, self.maxpages)

                # Queue every comment page with the download platform.
                for page in range(1, page_count + 1):
                    commentUrl = Comments.COMMENTS_URL % (bookId, page,
                                                          Comments.PAGE_SIZE)
                    self.storeurl(commentUrl, params.originalurl,
                                  Comments.STEP_3, {'bookId': bookId})

            # Comment page received: parse and store its comments.
            elif params.step == Comments.STEP_3:
                commentsinfo = json.loads(params.content)

                for comment in commentsinfo['page']['result']:
                    curtime = TimeUtility.getuniformtime(
                        comment['creationDate'])
                    content = comment['summary']
                    # 'nikeName' is the key as read from the payload.
                    nick = comment['marks']['nikeName']
                    # Skip comments that are already stored.
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)

        except Exception:
            # "except Exception, e" was Python-2-only syntax and the bound
            # name was never used.
            traceback.print_exc()
Ejemplo n.º 4
0
    def process(self, params):
        """Run one step of the Youku comment / play-count crawl.

        Dispatches on ``params.step``:

        * ``YoukuComments.STEP_1`` -- extract the video id from the page,
          queue the first comment page and the play-info request, and record
          the online count for Laifeng live pages.
        * ``YoukuComments.STEP_2`` -- either parse the play-info response
          (click and vote counts) or read the comment totals and queue
          comment pages 1..N (N capped at ``self.maxpages``).
        * ``YoukuComments.STEP_3`` -- store every comment on the page that is
          not already stored.

        Any ``Exception`` is logged through ``Logger.printexception`` and
        swallowed.
        """
        try:
            # Compare steps by value; identity ("is") only works by accident
            # for interned constants.
            if params.step == YoukuComments.STEP_1:
                # Extract the video id used to build the comment URL.
                objectId = self.r.getid('videoId', params.content,
                                        r'\s*:\s*"')
                # Current time in milliseconds (second resolution).
                pTime = str(
                    int(
                        time.mktime(datetime.datetime.now().timetuple()) *
                        1000))
                # Request signature derived from the fixed key and pTime.
                sign = MD5().m(
                    '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)
                # Build the URL of the first comment page.
                # NOTE(review): objectId may be falsy here (see the guard on
                # the play-info request below); the comment URL is still
                # queued in that case -- confirm this is intended.
                comments_url = YoukuComments.COMMENTS_URL % (
                    objectId, 1, YoukuComments.PAGE_SIZE, sign, pTime)
                # Ask the download platform to fetch the first comment page.
                self.storeurl(comments_url, params.originalurl,
                              YoukuComments.STEP_2, {'objectId': objectId})

                # Laifeng live pages: record the online count as click count.
                if self.r.search(r'^http://v\.laifeng\.com/\d+',
                                 params.originalurl):
                    clicknum = int(self.r.getid('onlineNum', params.content))
                    NewsStorage.setclicknum(params.originalurl, clicknum)

                if objectId:
                    playinfo_url = YoukuComments.PLAYINFO_URL.format(
                        vid=objectId)
                    self.storeurl(playinfo_url, params.originalurl,
                                  YoukuComments.STEP_2, {'objectId': objectId})

            # First responses received: play info or first comment page.
            elif params.step == YoukuComments.STEP_2:
                if re.search(r'getVideoPlayInfo\?vid', params.url):
                    # Drop a fixed 20-character prefix and 2-character suffix
                    # before parsing -- presumably the JSONP callback
                    # wrapper; verify against a live response.
                    playinfo = json.loads((params.content)[20:-2])
                    clicknum = int(playinfo['data']['stat']['vv'].replace(
                        ',', ''))
                    votenum = int(playinfo['data']['updown']['up'].replace(
                        ',', ''))
                    NewsStorage.setclicknum(params.originalurl, clicknum)
                    NewsStorage.setvotenum(params.originalurl, votenum)
                else:
                    objectId = params.customized['objectId']
                    # JSON payload returned by the comment API.
                    comments = json.loads(params.content)
                    # A missing or empty 'data' member means no comments.
                    if not comments.get('data'):
                        Logger.getlogging().warning(
                            "{url}:30000 No comments!".format(
                                url=params.originalurl))
                        return

                    # Incremental check. int() must wrap the count itself,
                    # not the result of the comparison.
                    comments_count = int(comments['data']['totalSize'])
                    cmtnum = CMTStorage.getcount(params.originalurl, True)
                    if comments_count <= cmtnum:
                        return
                    NewsStorage.setcmtnum(params.originalurl, comments_count)

                    # Number of comment pages reported by the API.
                    comments_pages = int(comments['data']['totalPage'])
                    if comments_pages == 0:
                        return
                    # Never queue more than self.maxpages pages.
                    comments_pages = min(comments_pages, self.maxpages)

                    # NOTE(review): the returned time is not used in this
                    # method; the call is kept in case it has side effects.
                    lasttime = CMTStorage.getlastpublish(
                        params.originalurl, True)

                    # Current time in milliseconds and matching signature.
                    pTime = str(
                        int(
                            time.mktime(datetime.datetime.now().timetuple()) *
                            1000))
                    sign = MD5().m(
                        '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' +
                        pTime)
                    # Pages are 1-based: queue 1..comments_pages. The former
                    # range(0, n + 1) with "page + 1" requested one page
                    # beyond the last one.
                    for page in range(1, comments_pages + 1):
                        commentUrl = YoukuComments.COMMENTS_URL % (
                            objectId, page, YoukuComments.PAGE_SIZE, sign,
                            pTime)
                        self.storeurl(commentUrl, params.originalurl,
                                      YoukuComments.STEP_3,
                                      {'objectId': objectId})

            # Comment page received: parse and store its comments.
            elif params.step == YoukuComments.STEP_3:
                commentsinfo = json.loads(params.content)
                for comment in commentsinfo['data']['comment']:
                    # NOTE(review): under Python 2, str() on a non-ASCII
                    # unicode value raises unless the default encoding was
                    # changed -- confirm the conversion is wanted.
                    content = str(comment['content'])
                    curtime = TimeUtility.getuniformtime(
                        int(comment['createTime']))
                    nick = comment['user']['userName']
                    # Skip comments that are already stored.
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)
        except Exception:
            Logger.printexception()
Ejemplo n.º 5
0
    def process(self, params):
        """Run one step of the Tudou comment crawl.

        Dispatches on ``params.step``:

        * ``TudouComments.STEP_1`` -- extract the video id from the page and
          queue the first comment page.
        * ``TudouComments.STEP_2`` -- read the comment totals and, when more
          comments exist than are already stored, queue comment pages 1..N
          (N capped at ``self.maxpages``).
        * ``TudouComments.STEP_3`` -- store every comment on the page that is
          not already stored.

        Any ``Exception`` is logged through ``Logger.printexception`` and
        swallowed.
        """
        try:
            # Compare steps by value; identity ("is") only works by accident
            # for interned constants.
            if params.step == TudouComments.STEP_1:
                # Extract the video id used to build the comment URL.
                objectId = self.r.getid('vid', params.content, r'\s*:\s*"')
                # Current time in milliseconds (second resolution).
                pTime = str(
                    int(
                        time.mktime(datetime.datetime.now().timetuple()) *
                        1000))
                # Request signature derived from the fixed key and pTime.
                sign = MD5().m(
                    '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)
                # Build the URL of the first comment page.
                comments_url = TudouComments.COMMENTS_URL % (
                    objectId, 1, TudouComments.PAGE_SIZE, sign, pTime)
                # Ask the download platform to fetch the first comment page.
                self.storeurl(comments_url, params.originalurl,
                              TudouComments.STEP_2, {'objectId': objectId})

            # First comment page received: queue all comment pages.
            elif params.step == TudouComments.STEP_2:
                objectId = params.customized['objectId']
                # JSON payload returned by the comment API.
                comments = json.loads(params.content)
                # A missing or empty 'data' member means no comments.
                if not comments.get('data'):
                    Logger.getlogging().warning(
                        "{url}:30000 No comments!".format(
                            url=params.originalurl))
                    return

                # Incremental check. int() must wrap the count itself, not
                # the result of the comparison.
                comments_count = int(comments['data']['totalSize'])
                cmtnum = CMTStorage.getcount(params.originalurl, True)
                if comments_count <= cmtnum:
                    return
                NewsStorage.setcmtnum(params.originalurl, comments_count)

                # Number of comment pages reported by the API.
                comments_pages = int(comments['data']['totalPage'])
                if comments_pages == 0:
                    return
                # Never queue more than self.maxpages pages.
                comments_pages = min(comments_pages, self.maxpages)

                # NOTE(review): the returned time is not used in this method;
                # the call is kept in case it has side effects.
                lasttime = CMTStorage.getlastpublish(params.originalurl, True)

                # Current time in milliseconds and matching signature.
                pTime = str(
                    int(
                        time.mktime(datetime.datetime.now().timetuple()) *
                        1000))
                sign = MD5().m(
                    '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)
                # Pages are 1-based: queue 1..comments_pages. The former
                # range(0, n + 1) with "page + 1" requested one page beyond
                # the last one.
                for page in range(1, comments_pages + 1):
                    commentUrl = TudouComments.COMMENTS_URL % (
                        objectId, page, TudouComments.PAGE_SIZE, sign, pTime)
                    self.storeurl(commentUrl, params.originalurl,
                                  TudouComments.STEP_3, {'objectId': objectId})

            # Comment page received: parse and store its comments.
            elif params.step == TudouComments.STEP_3:
                commentsinfo = json.loads(params.content)
                for comment in commentsinfo['data']['comment']:
                    content = comment['content']
                    curtime = TimeUtility.getuniformtime(
                        int(comment['createTime']))
                    nick = comment['user']['userName']
                    # Skip comments that are already stored.
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)
        except Exception:
            Logger.printexception()
Ejemplo n.º 6
0
 def isnewesttime(self, url, curtime):
     """Tell whether ``curtime`` is later than the last stored publish time.

     Returns ``True`` when ``curtime`` compares greater than the value of
     ``CMTStorage.getlastpublish(url)``, ``False`` otherwise.
     """
     last_publish = CMTStorage.getlastpublish(url)
     return bool(curtime > last_publish)