Beispiel #1
0
    def analyse(self, src):
        """Fetch and parse the first page of comments for one AcFun entry.

        ``src.get_id()`` is converted to ``int``. A positive id is requested
        through the regular submission endpoint; any other id is treated as a
        bangumi, whose id is stored negated, and requested through the bangumi
        endpoint.

        Returns:
            A list of ``ACcommentsPO`` objects (possibly empty) on success, or
            ``None`` when the request fails, the response is not valid JSON,
            the ``commentContentArr`` container is missing, or any entry
            cannot be parsed.
        """
        comments = []  # comments collected for this single entry
        acid = int(src.get_id())

        # Bangumi ids are stored as negative numbers.
        if acid > 0:
            url = "http://www.acfun.tv/comment_list_json.aspx?contentId=" + str(acid) + "&currentPage=1"
        else:
            url = "http://www.acfun.tv/comment/bangumi/web/list?bangumiId=" + str(-acid) + "&pageNo=1"

        jsonContent = self.sendGet(url)
        if not self.checkURL(jsonContent):
            logging.warning("connect acfun comments fail")
            return None

        try:
            j_obj = json.loads(jsonContent)
        except Exception:
            logging.warning("get acfun comments fail")
            return None

        # The bangumi response nests the comment container under "data".
        # Only lookup failures are handled here, so KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        try:
            if acid > 0:
                json_data = j_obj["commentContentArr"]
            else:
                json_data = j_obj["data"]["commentContentArr"]
        except (KeyError, TypeError):
            logging.error("commentContentArr is not exist")
            return None

        # Any failure while parsing an entry discards the whole result.
        try:
            # Each iterated item is used as a key back into json_data.
            for m, n in enumerate(json_data):
                comment = ACcommentsPO()  # holds one comment
                comment.set_acid(acid)  # entry id
                entry = json_data[n]
                comment.set_cid(int(entry["cid"]))  # comment id
                comment.set_content(entry["content"])  # comment text
                comment.set_user_name(entry["userName"])  # author name
                comment.set_layer(int(entry["count"]))  # floor number
                userID = int(entry["userID"])  # author user id

                # "siji" check (implemented by checkSIJI)
                self.checkSIJI(comment)

                # deletion check
                self.checkDelete(comment, userID)

                # timestamp of this check
                comment.set_check_time(str(datetime.datetime.now()))

                # NOTE: when to persist the data is still to be decided
                comments.append(comment)

                # Do not spend too long on entries with a huge comment count.
                if m > 3000:
                    logging.error("over 3000, drop it.")
                    break

        except Exception:
            logging.error("commentContentArr is not exist")
            return None

        return comments
Beispiel #2
0
    def load_data(self, data):
        """Build an ``ACcommentsPO`` from an indexable record.

        Positions 0-7 of ``data`` are read in order as: cid, content, user
        name, layer, acid, delete flag, siji flag and check time. The cid,
        layer, acid, delete and siji values are converted with ``int``; the
        others are passed through unchanged.
        """
        po = ACcommentsPO()
        # (setter name, index into data, whether the value is cast to int)
        layout = (
            ("set_cid", 0, True),
            ("set_content", 1, False),
            ("set_user_name", 2, False),
            ("set_layer", 3, True),
            ("set_acid", 4, True),
            ("set_delete", 5, True),
            ("set_siji", 6, True),
            ("set_check_time", 7, False),
        )
        for setter, position, as_int in layout:
            getattr(po, setter)(int(data[position]) if as_int else data[position])

        return po
Beispiel #3
0
    def analyse(self, src):
        """Fetch and parse the first page of comments for one AcFun submission.

        The submission id is taken as everything after the first ``"/ac"`` in
        ``str(src.get_url())`` and then passed through ``self.clear_acid``.

        Returns:
            A list of ``ACcommentsPO`` objects (possibly empty) on success, or
            ``None`` when the request fails, the response is not valid JSON,
            ``commentContentArr`` is missing, or any entry cannot be parsed.
        """
        row = []  # comments collected for this single submission
        strACid = str(src.get_url())
        # NOTE(review): when "/ac" is absent, find() returns -1 and the slice
        # starts at index 2 -- confirm callers always pass URLs containing it.
        acid = strACid[strACid.find("/ac") + 3:]
        acid = self.clear_acid(acid)
        url = "http://www.acfun.tv/comment_list_json.aspx?contentId=" + acid + "&currentPage=1"

        jsonContent = self.sendGet(url)
        if not self.checkURL(jsonContent):
            logging.warning("connect acfun comments fail")
            return None

        try:
            j_obj = json.loads(jsonContent)
        except Exception:
            logging.warning("get acfun comments fail")
            return None

        # commentContentArr is occasionally missing from the response; that
        # case, like any failure while parsing an entry, discards the result.
        try:
            json_data = j_obj["commentContentArr"]
            # Each iterated item is used as a key back into json_data.
            for m, n in enumerate(json_data):
                comment = ACcommentsPO()  # holds one comment

                comment.set_acid(int(acid))  # submission id
                entry = json_data[n]
                comment.set_cid(int(entry["cid"]))  # comment id
                comment.set_content(entry["content"])  # comment text
                comment.set_user_name(entry["userName"])  # author name
                comment.set_quote_cid(int(entry["quoteId"]))  # quoted comment id
                comment.set_layer(int(entry["count"]))  # floor number
                userID = int(entry["userID"])  # author user id

                # hot-comment height, set to 0 for now
                comment.set_height(0)

                # "siji" check (implemented by checkSIJI)
                self.checkSIJI(comment)

                # deletion check
                self.checkDelete(comment, userID)

                # "zuipao" flag, hard-coded for now
                comment.set_zuipao(0)

                # timestamp of this check
                comment.set_check_time(str(datetime.datetime.now()))

                # NOTE: when to persist the data is still to be decided
                row.append(comment)

                # Do not spend too long on submissions with a huge comment count.
                if m > 3000:
                    logging.error("over 3000, drop it.")
                    break

        except Exception:
            logging.error("commentContentArr is not exist")
            return None

        # Hot-comment extraction and the direct database insert were dropped;
        # the caller receives the parsed comments.
        return row
Beispiel #4
0
 def load_data(self, data):
     """Build an ``ACcommentsPO`` from an indexable record.

     Positions 0-7 of ``data`` are read in order as: cid, content, user
     name, layer, acid, delete flag, siji flag and check time. The cid,
     layer, acid, delete and siji values are converted with ``int``.
     """
     po = ACcommentsPO()

     # identity and text of the comment
     po.set_cid(int(data[0]))
     po.set_content(data[1])
     po.set_user_name(data[2])

     # position and owning entry
     po.set_layer(int(data[3]))
     po.set_acid(int(data[4]))

     # status flags and the time of the check
     po.set_delete(int(data[5]))
     po.set_siji(int(data[6]))
     po.set_check_time(data[7])

     return po
Beispiel #5
0
    def analyse(self, src):
        """Fetch and parse the first page of comments for one AcFun entry.

        ``src.get_id()`` is converted to ``int``. A positive id selects the
        regular submission endpoint; any other id is treated as a bangumi,
        whose id is stored negated, and selects the bangumi endpoint.

        Returns:
            A list of ``ACcommentsPO`` objects (possibly empty) on success, or
            ``None`` when the request fails, the response is not valid JSON,
            the ``commentContentArr`` container is missing, or any entry
            cannot be parsed.
        """
        parsed = []  # comments collected for this single entry
        acid = int(src.get_id())
        is_bangumi = not acid > 0  # bangumi ids are stored as negative numbers

        if is_bangumi:
            url = ('http://www.acfun.tv/comment/bangumi/web/list?bangumiId='
                   + str(-acid) + '&pageNo=1')
        else:
            url = ("http://www.acfun.tv/comment_list_json.aspx?contentId="
                   + str(acid) + "&currentPage=1")

        jsonContent = self.sendGet(url)
        if not self.checkURL(jsonContent):
            logging.warning("connect acfun comments fail")
            return None

        try:
            j_obj = json.loads(jsonContent)
        except Exception:
            logging.warning("get acfun comments fail")
            return None

        # The bangumi response nests the comment container under "data".
        # Catch lookup failures only; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        try:
            container = j_obj['data'] if is_bangumi else j_obj
            json_data = container["commentContentArr"]
        except (KeyError, TypeError):
            logging.error("commentContentArr is not exist")
            return None

        # Any failure while parsing an entry discards the whole result.
        try:
            # Each iterated item is used as a key back into json_data.
            for m, n in enumerate(json_data):
                comment = ACcommentsPO()  # holds one comment
                comment.set_acid(acid)  # entry id
                entry = json_data[n]
                comment.set_cid(int(entry["cid"]))  # comment id
                comment.set_content(entry["content"])  # comment text
                comment.set_user_name(entry["userName"])  # author name
                comment.set_layer(int(entry["count"]))  # floor number
                userID = int(entry["userID"])  # author user id

                # "siji" check (implemented by checkSIJI)
                self.checkSIJI(comment)

                # deletion check
                self.checkDelete(comment, userID)

                # timestamp of this check
                comment.set_check_time(str(datetime.datetime.now()))

                # NOTE: when to persist the data is still to be decided
                parsed.append(comment)

                # Do not spend too long on entries with a huge comment count.
                if m > 3000:
                    logging.error("over 3000, drop it.")
                    break

        except Exception:
            logging.error("commentContentArr is not exist")
            return None

        return parsed