def load_data(self, data):
    """Build an ACcommentsPO from one positional record.

    Args:
        data: Indexable record read in this fixed order:
            0 -> cid (converted with int), 1 -> content, 2 -> user name,
            3 -> layer (int), 4 -> acid (int), 5 -> delete flag (int),
            6 -> siji flag (int), 7 -> check time (passed through as-is).

    Returns:
        The populated ACcommentsPO instance.
    """
    comment = ACcommentsPO()

    # Identity and text of the comment.
    comment.set_cid(int(data[0]))
    comment.set_content(data[1])
    comment.set_user_name(data[2])

    # Position of the comment and the submission it belongs to.
    comment.set_layer(int(data[3]))
    comment.set_acid(int(data[4]))

    # Status flags and the time the record was checked.
    comment.set_delete(int(data[5]))
    comment.set_siji(int(data[6]))
    comment.set_check_time(data[7])

    return comment
def analyse(self, src):
    """Fetch and parse the first page of comments for one submission.

    The id is read from ``src.get_id()``. A positive id is requested from
    the article comment endpoint; any other id is treated as a bangumi
    entry (stored with a negated id) and requested from the bangumi
    endpoint using the absolute value.

    Args:
        src: Object exposing ``get_id()``; the value must be convertible
            with ``int``.

    Returns:
        A list of ACcommentsPO objects (possibly empty), or ``None`` when
        the request fails, the response is not valid JSON, the
        ``commentContentArr`` section is missing, or any comment entry
        cannot be parsed.
    """
    # Start from an empty list so there is always a defined result to
    # return on the success path.
    comments = []
    acid = int(src.get_id())

    # Bangumi entries carry a negative id.
    if acid > 0:
        # The separator must be "&currentPage"; "&curren" is easily
        # mangled into the currency sign by HTML entity decoding.
        url = ("http://www.acfun.tv/comment_list_json.aspx?contentId="
               + str(acid) + "&currentPage=1")
    else:
        url = ("http://www.acfun.tv/comment/bangumi/web/list?bangumiId="
               + str(-acid) + "&pageNo=1")

    json_content = self.sendGet(url)
    if not self.checkURL(json_content):
        logging.warning("connect acfun comments fail")
        return None

    try:
        payload = json.loads(json_content)
    except Exception:
        logging.warning("get acfun comments fail")
        return None

    # The bangumi response nests the comment section under "data".
    try:
        if acid > 0:
            json_data = payload["commentContentArr"]
        else:
            json_data = payload["data"]["commentContentArr"]
    except Exception:
        logging.error("commentContentArr is not exist")
        return None

    # Entries are occasionally malformed; any parsing failure discards
    # the whole submission.
    try:
        for index, key in enumerate(json_data):
            comment = ACcommentsPO()
            comment.set_acid(acid)

            entry = json_data[key]
            comment.set_cid(int(entry["cid"]))
            comment.set_content(entry["content"])
            comment.set_user_name(entry["userName"])
            comment.set_layer(int(entry["count"]))
            user_id = int(entry["userID"])

            # Delegate the "siji" and deletion checks to their helpers.
            self.checkSIJI(comment)
            self.checkDelete(comment, user_id)

            # Timestamp of this check (when to persist is still open).
            comment.set_check_time(str(datetime.datetime.now()))
            comments.append(comment)

            # Do not spend too long on submissions with a huge number
            # of comments.
            if index > 3000:
                logging.error("over 3000, drop it.")
                break
    except Exception:
        logging.error("commentContentArr is not exist")
        return None

    return comments
def analyse(self, src):
    """Fetch and parse the first page of comments for one submission.

    The submission id is taken from the text following the first "/ac"
    in ``src.get_url()`` and then passed through ``self.clear_acid``.

    Args:
        src: Object exposing ``get_url()``.

    Returns:
        A list of ACcommentsPO objects (possibly empty), or ``None`` when
        the request fails, the response is not valid JSON, or the
        ``commentContentArr`` section is missing or cannot be parsed.
    """
    # Start from an empty list so there is always a defined result to
    # return on the success path.
    comments = []

    page_url = str(src.get_url())
    # NOTE(review): if "/ac" is absent, find() returns -1 and the slice
    # starts at index 2; presumably clear_acid copes with that - confirm.
    acid = page_url[page_url.find("/ac") + 3:]
    acid = self.clear_acid(acid)

    # The separator must be "&currentPage"; "&curren" is easily mangled
    # into the currency sign by HTML entity decoding.
    url = ("http://www.acfun.tv/comment_list_json.aspx?contentId="
           + acid + "&currentPage=1")

    json_content = self.sendGet(url)
    if not self.checkURL(json_content):
        logging.warning("connect acfun comments fail")
        return None

    try:
        payload = json.loads(json_content)
    except Exception:
        logging.warning("get acfun comments fail")
        return None

    # "commentContentArr" is occasionally missing, so the lookup stays
    # inside the guarded section together with the per-entry parsing.
    try:
        comment_map = payload["commentContentArr"]
        for index, key in enumerate(comment_map):
            comment = ACcommentsPO()
            comment.set_acid(int(acid))

            entry = comment_map[key]
            comment.set_cid(int(entry["cid"]))
            comment.set_content(entry["content"])
            comment.set_user_name(entry["userName"])
            comment.set_quote_cid(int(entry["quoteId"]))
            comment.set_layer(int(entry["count"]))
            user_id = int(entry["userID"])

            # Hot-comment height starts at 0.
            comment.set_height(0)

            # Delegate the "siji" and deletion checks to their helpers.
            self.checkSIJI(comment)
            self.checkDelete(comment, user_id)

            # The "zuipao" flag is hard-coded to 0 for now.
            comment.set_zuipao(0)

            # Timestamp of this check (when to persist is still open).
            comment.set_check_time(str(datetime.datetime.now()))
            comments.append(comment)

            # Do not spend too long on submissions with a huge number
            # of comments.
            if index > 3000:
                logging.error("over 3000, drop it.")
                break
    except Exception:
        logging.error("commentContentArr is not exist")
        return None

    return comments
def analyse(self, src):
    """Download and parse the first comment page of a submission.

    ``src.get_id()`` supplies the id. Ids greater than zero are looked up
    through the article comment endpoint; all other ids are treated as
    bangumi entries (kept with a negated id) and looked up through the
    bangumi endpoint with the sign flipped.

    Args:
        src: Object exposing ``get_id()``; the value must be convertible
            with ``int``.

    Returns:
        A list of ACcommentsPO objects (possibly empty), or ``None`` if
        the download fails, the body is not valid JSON, the
        ``commentContentArr`` section is absent, or an entry cannot be
        parsed.
    """
    # An empty list guarantees a defined result on the success path.
    parsed = []
    acid = int(src.get_id())

    # Bangumi entries carry a negative id.
    is_article = acid > 0
    if is_article:
        # Keep the literal "&currentPage": "&curren" is prone to being
        # decoded as the currency-sign HTML entity.
        url = "http://www.acfun.tv/comment_list_json.aspx?contentId={}&currentPage=1".format(acid)
    else:
        url = "http://www.acfun.tv/comment/bangumi/web/list?bangumiId={}&pageNo=1".format(-acid)

    body = self.sendGet(url)
    if not self.checkURL(body):
        logging.warning("connect acfun comments fail")
        return None

    try:
        document = json.loads(body)
    except Exception:
        logging.warning("get acfun comments fail")
        return None

    # Bangumi responses wrap the comment section in a "data" object.
    try:
        container = document if is_article else document["data"]
        json_data = container["commentContentArr"]
    except Exception:
        logging.error("commentContentArr is not exist")
        return None

    # Malformed entries show up occasionally; one bad entry discards the
    # whole submission.
    try:
        for position, key in enumerate(json_data):
            comment = ACcommentsPO()
            comment.set_acid(acid)

            fields = json_data[key]
            comment.set_cid(int(fields["cid"]))
            comment.set_content(fields["content"])
            comment.set_user_name(fields["userName"])
            comment.set_layer(int(fields["count"]))
            author_id = int(fields["userID"])

            # "siji" and deletion checks are handled by dedicated helpers.
            self.checkSIJI(comment)
            self.checkDelete(comment, author_id)

            # Record when this check happened (persist time still open).
            comment.set_check_time(str(datetime.datetime.now()))
            parsed.append(comment)

            # Cap the effort spent on submissions with very many comments.
            if position > 3000:
                logging.error("over 3000, drop it.")
                break
    except Exception:
        logging.error("commentContentArr is not exist")
        return None

    return parsed