def __storeurllist__(self, urllist, type=constant.SPIDER_S2_WEBSITE_VIDEO, referlist=None):
    """Store every url in *urllist* into the hot database unless it is
    already present in the cold database.

    @param urllist:   iterable of url strings to store
    @param type:      website-type constant attached to each url
    @param referlist: optional list of refer urls aligned with urllist
                      (currently unused by the active code path)

    Fix: the default for referlist used to be a shared mutable list
    ([]) — the classic Python pitfall; None is now normalized to a fresh
    list per call. The unused `count`/`index` locals (leftovers of the
    commented-out URLManager path) are removed.
    """
    if referlist is None:
        referlist = []
    for url in urllist:
        params = PageBasicInfo()
        params.url = url
        params.type = type
        # Skip urls already in the cold database; otherwise insert
        # them into the hot database.
        if not NewsStorage.exist_cold(url):
            NewsStorage.seturlinfos(params)
def setclick(self, params):
    """Pull the play count and the up-vote count out of the page
    content and persist whichever of the two is present."""
    url = params.originalurl
    plays = self.r.getid('play_count', params.content)
    if plays:
        NewsStorage.setclicknum(url, plays)
    ups = self.r.getid('up', params.content)
    if ups:
        NewsStorage.setvotenum(url, ups)
def ifengnews_step2(self, params):
    """Parse the ifeng comment-count JSON, persist click/comment
    totals, and enqueue the comment pages still missing locally."""
    try:
        oriurl = params.customized['oriurl']
        data = json.loads(params.content)
        # join_count doubles as a click counter; skip when absent (<= 0).
        joincount = float(data.get('join_count', '-1'))
        if joincount > 0:
            NewsStorage.setclicknum(params.originalurl, joincount)
        total = float(data['count'])
        NewsStorage.setcmtnum(params.originalurl, total)
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= total:
            return  # nothing new since the last crawl
        # pages still missing, capped at maxpages
        pages = min(int(math.ceil(float(total - stored) / self.page_size)),
                    self.maxpages)
        for pageno in range(1, pages + 1):
            if pageno == 1:
                # the first page is the response we already hold
                self.ifengnews_step3(params)
            else:
                nexturl = IfengNewsComments.COMMENTS_URL.format(
                    oriurl=oriurl, pg=pageno, ps=self.page_size)
                self.storeurl(nexturl, params.originalurl,
                              IfengNewsComments.IFENG_NEWS_NEXT_PAGE)
    except:
        Logger.printexception()
def step2(self, params):
    """Read the xinhua comment summary JSON and enqueue one url per
    comment page that still needs fetching."""
    try:
        Logger.getlogging().info("xinhuaComments.STEP_2")
        # newsId was passed down from STEP_1 via customized
        news_id = params.customized['newsId']
        summary = json.loads(params.content)
        total = summary['totalRows']
        NewsStorage.setcmtnum(params.originalurl, total)
        pages = summary['totalPage']
        # incremental check: bail out when nothing new arrived
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= total:
            return
        pages = min(pages, self.maxpages)  # hard page cap
        for pid in range(1, int(pages) + 1):
            nexturl = xinhuaNewsComments.COMMENTS_URL_NEWS.format(
                newsId=news_id, pid=pid)
            self.storeurl(nexturl, params.originalurl,
                          xinhuaNewsComments.STEP_3)
    except:
        Logger.printexception()
def setp_3(self, params):
    """Read the LeTV comment total from JSON and enqueue the comment
    pages that are not stored yet."""
    data = json.loads(params.content)
    total = float(data['total'])
    NewsStorage.setcmtnum(params.originalurl, total)
    if int(total) == 0:
        return
    # incremental check against the database
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pages = min(int(math.ceil(float(total - stored) / self.PAGE_SIZE)),
                self.maxpages)
    # comment request parameters passed down from the previous step
    cid = params.customized['cid']
    xid = params.customized['xid']
    pid = params.customized['pid']
    for pageno in range(1, pages + 1):
        if pageno == 1:
            # page 1 content is the response we are holding right now
            self.getcomments(params)
            continue
        nexturl = LeComments.COMMENTS_URL % (cid, pageno, xid, pid)
        self.storeurl(nexturl, params.originalurl, LeComments.STEP_4)
def setclick(self, params): soup = BeautifulSoup(params.content, 'html5lib') #电视剧 itemcount = soup.select('.mod_episode > .item') if itemcount: total = self.str2num( soup.select_one('#mod_cover_playnum').get_text()) clicknum = total / len(itemcount) NewsStorage.setclicknum(params.originalurl, clicknum) return #其他 parentid = params.originalurl.split('.')[-2].split('/')[-1] #figures_list = soup.find_all(attrs={'class':re.compile('^figures?_list$')}) for fitem in soup.find_all( attrs={'class': re.compile('^figures?_list$')}): #list_items = fitem.find_all(attrs={'class':re.compile('list_item')}) for item in fitem.find_all( attrs={'class': re.compile('list_item')}): childurl = item.select_one('a').get('href', None) childid = childurl.split('.')[-2].split('/')[-1] #Logger.getlogging().debug('childid:'+childid+'\t'+'parentid:'+parentid) if childid == parentid: numobj = item.find(attrs={ 'class': re.compile('num _video_playnum|figure_num') }) if not numobj: continue clicknum = self.str2num(numobj.get_text()) NewsStorage.setclicknum(params.originalurl, clicknum) return
def step1(self, params):
    """First comment step: extract the summary JSON embedded in the
    response, store the comment total, and enqueue the paged comment
    urls that still need fetching.

    Fix: the page computation used Python 2 integer division
    ((a - b) / self.pageSize), so math.ceil never rounded up and the
    final partial page of comments was silently dropped; the numerator
    is now forced to float.
    """
    try:
        url = params.originalurl
        videoId = params.customized['videoId']
        # The response wraps JSON in other text; cut out the {...} span.
        params.content = params.content[
            params.content.index('{'):params.content.rindex('}') + 1]
        jsonData = json.loads(params.content)['data']
        hasCmts = jsonData['page']['count']  # comment presence/count
        if not hasCmts:
            return
        # Incremental check: current total vs. previously collected.
        currCmtsCount = jsonData['page']['acount']
        NewsStorage.setcmtnum(url, currCmtsCount)
        prevCmtsCount = int(CMTStorage.getcount(url))
        if prevCmtsCount >= currCmtsCount:
            return
        # float() so the division rounds up instead of truncating (Py2).
        pageNum = int(math.ceil(float(hasCmts - prevCmtsCount) / self.pageSize))
        for page in range(1, pageNum + 1):
            if page == 1:
                # NOTE(review): unlike sibling handlers there is no
                # `continue` here, so page 1 is also enqueued below —
                # preserved as-is; confirm whether that is intended.
                self.step2(params)
            pageUrl = self.pageUrl.format(page=page, videoId=videoId)
            self.storeurl(pageUrl, url, self.STEP_CMTS)
    except:
        Logger.printexception()
def getcomments_step2(self, params):
    """Parse the comment-list paging attributes and issue one POST
    request per comment page still missing locally."""
    bookId = params.customized['bookId']
    parser = XPathUtility(html=params.content)
    page_counts = int(parser.xpath('//div[@class="page"]/@pagenum')[0])
    comments_count = int(parser.xpath('//div[@class="page"]/@total')[0])
    Logger.getlogging().debug(comments_count)
    if page_counts == 0:
        return
    # incremental check: how many comments we already hold
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= comments_count:
        return
    pages = min(
        int(math.ceil(float(comments_count - stored) / self.PAGE_SIZE)),
        self.maxpages)
    NewsStorage.setcmtnum(params.originalurl, comments_count)
    for pageno in range(1, pages + 1):
        self.storeposturl(PubComments.COMMENTS_URL, params.originalurl,
                          PubComments.STEP_3,
                          {'bookId': bookId, 'pageNum': pageno})
def step1(self, params):
    """Read the douban comment total from the page and enqueue the
    paged comment-list urls."""
    # movie id embedded in the url
    articleId = self.r.parse(r'^https://movie\.douban\.com/\w+/(\d+)',
                             params.url)[0]
    # comment total text, e.g. the "(NNNN)" link in the section header
    xpathobj = XPathUtility(params.content)
    text = xpathobj.getstring(
        xpath='//*[@id="comments-section"]//h2/*[@class="pl"]/a')
    numtext = self.r.parse('\d+', text)
    if not numtext:
        return
    total = float(numtext[0])
    NewsStorage.setcmtnum(params.originalurl, total)
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    # pages still missing, capped at maxpages
    pages = min(int(math.ceil(float(total - stored) / self.PAGE_SIZE)),
                self.maxpages)
    for pageno in range(1, pages + 1):
        nexturl = doubanComments.COMMENTS_URL.format(
            articleId=articleId,
            start=(pageno - 1) * self.PAGE_SIZE,
            pagesize=self.PAGE_SIZE)
        self.storeurl(nexturl, params.originalurl, doubanComments.STEP_2)
def setclick(self, params):
    """Store comment/click/vote/fans counters from a JSON array.

    Payload layout: [plays, comments, ?, ?, danmaku, favorites,
    votes, ?] — counters are read by position.
    """
    try:
        stats = json.loads(params.content)
        # `or 0` normalizes any missing/falsy value to zero, matching
        # the previous explicit `if not x: x = 0` handling.
        NewsStorage.seturlinfo(
            params.originalurl,
            data={
                SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: stats[1] or 0,
                SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: stats[0] or 0,
                SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: stats[-2] or 0,
                SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: stats[-3] or 0
            })
    except:
        Logger.printexception()
def step2(self, params):
    """Read the iqiyi comment total and enqueue the remaining comment
    pages, picking the url template by whether a qitan id exists."""
    qitanid = params.customized['qitanid']
    tvid = params.customized['tvid']
    total = float(json.loads(params.content)['data']['count'])
    NewsStorage.setcmtnum(params.originalurl, total)
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pages = min(
        int(math.ceil(float(total - stored) / self.DEFAULT_PAGE_SIZE)),
        self.maxpages)
    for pageno in range(1, pages + 1):
        if int(qitanid):
            nexturl = IqiyiComments.COMMENTS_URL1.format(
                pageno=pageno, pagesize=IqiyiComments.DEFAULT_PAGE_SIZE,
                qitanid=qitanid, tvid=tvid)
        else:
            nexturl = IqiyiComments.COMMENTS_URL2.format(
                pageno=pageno, pagesize=IqiyiComments.DEFAULT_PAGE_SIZE,
                tvid=tvid)
        self.storeurl(nexturl, params.originalurl, IqiyiComments.STEP_3)
def process(self, proparam):
    # Dispatch a jiemian.com crawl step: STEP_1 reads the article page
    # and enqueues comment urls; STEP_3 parses a comment-list response
    # and stores new comments.
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step is jiemianComments.STEP_1:
            # article id from the url
            articleId = re.findall(r'^http://www\.jiemian\.com/\w+/(\d+)', proparam.url).__getitem__(0)
            # store the click counter
            self.setclick(proparam)
            # comment count scraped from the page markup
            comments_count = float(re.findall(r'"comment_count">(\d+)</span>', proparam.content).__getitem__(0))
            if comments_count:
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            # no comments -> nothing else to do
            if int(comments_count) == 0:
                return
            # incremental check against the database
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            if cmtnum >= comments_count:
                return
            page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            # enqueue one comment-list url per missing page
            for page in range(1, page_num + 1, 1):
                url = jiemianComments.COMMENTS_URL % (articleId, page)
                self.storeurl(url, proparam.originalurl, jiemianComments.STEP_3)
        elif proparam.step == jiemianComments.STEP_3:
            # (legacy BeautifulSoup parsing kept for reference)
            # proparam.content = proparam.content.replace('\\','')
            # soup = BeautifulSoup(proparam.content, 'html5lib')
            # items = soup.select('.comment-post')
            # for item in items:
            #     content = item.select_one('.comment-main > p').get_text().encode('utf-8')
            #     curtime = TimeUtility.getuniformtime(item.select_one('.date').get_text())
            #     nick = item.select_one('.author-name').get_text().decode('utf-8').encode('utf-8')
            # vote ("ding") count
            votenum = self.r.getid('ding', proparam.content)
            if votenum == '':
                Logger.getlogging().debug("Unable to get playcount")
            else:
                NewsStorage.setvotenum(proparam.originalurl, votenum)
            # Comment fields scraped with regexes from an escaped
            # (backslash-laden) JSON-ish payload.
            comments = re.findall(r'<p>(.+?)<\\/p>', proparam.content)
            ctime = re.findall(r'<span class=\\"date\\">(.+?)<\\/span>', proparam.content)
            nicks = re.findall(r'class=\\"author-name\\">(.+?)<\\/a>', proparam.content)
            for index in range(0, len(comments)):
                # strip the escaping backslashes before parsing the time
                time = ctime[index].replace('\\', '')
                curtime = TimeUtility.getuniformtime(time)
                # SECURITY NOTE(review): eval() on scraped content is
                # dangerous — a crafted comment containing a quote can
                # execute arbitrary code. Consider
                # content.decode('unicode_escape') instead.
                content = eval('u"' + comments[index] + '"').encode('utf-8')
                nick = eval('u"' + nicks[index] + '"').encode('utf-8')
                if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception, e:
        traceback.print_exc()
def step1(self, params):
    """Check the huxiu article's comment counter and enqueue the
    comment pages missing locally."""
    pattern = 'https://www.huxiu.com/article/(\d+).html'
    # guard clause: unsupported url layout
    if not self.r.search(pattern, params.originalurl):
        Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
        return
    object_id = self.r.parse(pattern, params.originalurl)[0]
    total = XPathUtility(params.content).getnumber(
        '//*[@class="article-pl pull-left"]')
    if not total:
        Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    NewsStorage.setcmtnum(params.originalurl, total)
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pages = min(int(math.ceil(float(total - stored) / self.page_size)),
                self.maxpages)
    for pageno in range(1, pages + 1):
        commonurl = self.COMMONURL.format(object_id=object_id, page=pageno)
        self.storeurl(commonurl, params.originalurl,
                      HuxiupostComments.EACH)
def step2(self, params):
    """Build the paged comment urls from the changyan topic summary;
    sohu tv pages use a dedicated client id and page size."""
    try:
        summary = json.loads(params.content)
        topic_id = summary['topic_id']
        total = float(summary.get('cmt_sum', -1))
        NewsStorage.setcmtnum(params.originalurl, total)
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= total:
            return
        pages = min(int(math.ceil(float(total - stored) / self.page_size)),
                    self.maxpages)
        for pageno in range(1, pages + 1):
            if self.r.search('http[s]{0,1}://.*tv\.sohu.com/.*',
                             params.originalurl):
                nexturl = self.COMMENTS_URL.format(
                    self.tv_client_id, topic_id, pageno, self.tv_page_size)
            else:
                nexturl = self.COMMENTS_URL.format(
                    self.client_id, topic_id, pageno, self.page_size)
            self.storeurl(nexturl, params.originalurl,
                          self.STEP_COMMENT_NEXT_PAGE)
    except:
        Logger.printexception()
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
def step2_ebook(self, params):
    """QQ-reader (ebook) only: read the comment summary JSON and
    enqueue the paged comment-list urls.

    Fixes: `dict.has_key()` replaced with the `in` operator (has_key is
    Python-2-only and long deprecated); `page_count` coerced to int
    once so the later range() cannot receive a non-int JSON value; the
    unused bound exception variable dropped.
    """
    try:
        bid = params.customized['bid']
        jsoncontent = json.loads(params.content)
        if 'data' not in jsoncontent:
            Logger.log(params.originalurl,
                       constant.ERRORCODE_SITE_NOGET_COMMNETS)
            return
        comments_count = jsoncontent['data']['total']
        page_count = int(jsoncontent['data']['pageCount'])
        # incremental check against the database
        cmtnum = CMTStorage.getcount(params.originalurl, True)
        NewsStorage.setcmtnum(params.originalurl, comments_count)
        if cmtnum >= comments_count:
            return
        # cap the number of pages crawled per pass
        if page_count >= self.maxpages:
            page_count = self.maxpages
        for page in range(1, page_count + 1):
            commentinfo_url = self.EBOOK_COMMENTS_URL.format(
                site='intro', bid=bid, page=page)
            self.storeurl(commentinfo_url, params.originalurl,
                          self.STEP_COMMENT_NEXT_PAGE)
    except Exception:
        Logger.printexception()
def step2bbs(self, params):
    """Read the joy bbs reply total and enqueue the comment-list pages
    still missing locally."""
    Logger.getlogging().info("JoyComments.STEP_2")
    topic_id = params.customized['topic_id']
    domain = params.customized['domain']
    try:
        payload = json.loads(params.content)
        comments_count = payload['result']['mainreplys']['page']['totalRows']
        NewsStorage.setcmtnum(params.originalurl, comments_count)
    except:
        # malformed/unexpected payload — log and give up on this page
        Logger.getlogging().warning(
            '{url} Errorcode:40000'.format(url=params.originalurl))
        return
    # incremental check
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= comments_count:
        return
    pages = min(
        int(math.ceil(float(comments_count - stored) / self.PAGE_SIZE)),
        self.maxpages)
    for pageno in range(1, pages + 1):
        nexturl = JoyComments.COMMENT_URL.format(topic_id=topic_id,
                                                 domain=domain,
                                                 page=pageno)
        self.storeurl(nexturl, params.originalurl, JoyComments.STEP_3_BBS)
def get_url_id(self, params):
    """Tencent video only: resolve the comment-id lookup url for this
    page, then backfill the publish date and click counter.

    A cid identifies a series/collection/movie cover page, a vid a
    single episode; the first matching pattern wins.
    """
    CID_PATTERN = 'https?://v\.qq\.com/x/cover/(\w+).html'
    CID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid={cid}'
    VID_PATTERN1 = 'https?://v\.qq\.com/x/cover/\w+/(\w+).html'
    VID_PATTERN2 = 'https?://v\.qq\.com/x/page/(\w+)\.html'
    VID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&vid={vid}'
    # (pattern, url template, format key) in priority order
    candidates = ((CID_PATTERN, CID_URL, 'cid'),
                  (VID_PATTERN1, VID_URL, 'vid'),
                  (VID_PATTERN2, VID_URL, 'vid'))
    for pattern, template, key in candidates:
        if self.r.search(pattern, params.originalurl):
            value = self.r.parse(pattern, params.originalurl)[0]
            self.storeurl(template.format(**{key: value}),
                          params.originalurl,
                          self.STEP_COMMENT_FIRST_PAGE)
            break
    # publish date: prefer the embedded "publish_date" field, fall
    # back to a handful of xpath locations on the page
    publish_date = self.r.getid('publish_date', params.content, split=':')
    if not publish_date:
        publish_date = XPathUtility(params.content).getstring(
            '//*[@class="video_tags"]/span|//*[@class="date"]|//*[@class="tag_item"]'
        )
    publish_date = TimeUtility.getuniformtime(publish_date)
    if publish_date:
        NewsStorage.setpublishdate(params.originalurl, publish_date)
    self.setclick(params)
def step2(self, params):
    """Read the comment total from the thread JSON and enqueue the
    remaining comment pages.

    Fixes: the loop tested the undefined name `page` (a NameError on
    every execution past page computation) where the loop variable is
    `offset`; the builtin-shadowing local `max` is renamed `pages`.
    """
    try:
        threadId = params.customized['threadId']
        jsondata = json.loads(params.content)
        comment_totalnum = jsondata['tcount']
        NewsStorage.setcmtnum(params.originalurl, comment_totalnum)
    except:
        Logger.getlogging().warning('{}:30000 No comments'.format(
            params.originalurl))
        return
    # incremental check against the database
    cmtnum = CMTStorage.getcount(params.originalurl, True)
    if cmtnum >= int(comment_totalnum):
        return
    pages = int(
        math.ceil(float(comment_totalnum - cmtnum) / VComments.limit))
    if pages > self.maxpages:
        pages = self.maxpages
    for offset in range(1, pages + 1, 1):
        if offset == 1:
            # the first page is the content we already hold
            self.step3(params)
            continue
        comment_url = VComments.COMMENT_URL.format(
            threadId=threadId, limit=VComments.limit,
            offset=offset * VComments.limit)
        self.storeurl(comment_url, params.originalurl,
                      VComments.V_STEP_3, {'threadId': threadId})
def step1(self, params):
    """Resolve the toutiao group id from the url and enqueue the first
    comment-list request.

    Fix: `self.r.parse(...)[0]` raised an uncaught IndexError when the
    url did not match, even though the code below clearly intends to
    log an error and return on a missing group id; the result is now
    indexed only when non-empty.
    """
    Logger.getlogging().info("ToutiaoNewsComments.STEP_1")
    ids = self.r.parse('http://www.toutiao.com/(\w+)/.*',
                       params.originalurl)
    group_id = ids[0] if ids else None
    if group_id:
        # strip the leading type letter (e.g. "a64230..." -> "64230...")
        group_id = group_id[1:]
    # Best-effort publish date extraction; failures only log.
    try:
        publishdate = self.r.getid("time", params.content)
        if not publishdate:
            publishdate = self.r.getid("publish_time", params.content)
        if publishdate:
            NewsStorage.setpublishdate(params.originalurl, publishdate)
    except:
        Logger.getlogging().error('{0}:30000'.format(params.originalurl))
    if not group_id:
        Logger.getlogging().error('{0}:30000'.format(params.originalurl))
        return
    commentinfo_url = ToutiaoNewsComments.COMMENTS_URL.format(
        group_id, 0, self.page_size)
    self.storeurl(commentinfo_url, params.originalurl,
                  ToutiaoNewsComments.STEP_2, {'group_id': group_id})
def getComments(self, params, url):
    """Parse one forum page of posts, store every new comment, and
    refresh the stored comment total for the thread."""
    # current page number, extracted from the page url
    pg = self.r.parse(url, params.url)[0]
    soup = BeautifulSoup(params.content, 'html5lib')
    posts = soup.select('tr > td.postcontent')
    # on page 1 the first post is the thread body, not a comment
    posts = posts[1:] if pg == '1' else posts
    for post in posts:
        body = post.select_one('div[class="postmessage defaultpost"]')
        if not body:
            continue
        content = body.get_text()\
            .replace('\t','').replace('\n','').replace(' ','').strip()
        # posted time, formatted like "发表于 2016-10-7 18:04:25"
        updatetime = post.select_one(
            'div.postinfo > font').get_text().strip()[4:] + ':00'
        curtime = getuniformtime(updatetime)
        nick = 'none'
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    NewsStorage.setcmtnum(params.originalurl,
                          CMTStorage.getcount(params.originalurl))
def step2(self, params):
    # Read the opera-site comment total from JSON and enqueue the
    # comment pages that still need fetching.
    operaId = params.customized['operaId']
    contentId = params.customized['contentId']
    # JSON payload holding the comment paging info
    comments = json.loads(params.content)
    # total number of comments
    curcmtnum = int(comments['pageTurn']['rowCount'])
    NewsStorage.setcmtnum(params.originalurl, curcmtnum)
    dbcmtnum = CMTStorage.getcount(params.originalurl, True)
    if dbcmtnum >= curcmtnum:
        return
    # number of comment pages still missing, capped at maxpages
    pages = int(math.ceil(float(curcmtnum - dbcmtnum) / self.PAGE_SIZE))
    if pages >= self.maxpages:
        pages = self.maxpages
    # Build one comment url per page and hand it to the download
    # platform.
    for page in range(1, pages + 1, 1):
        if page == 1:
            # NOTE(review): sibling handlers `continue` after handling
            # page 1 in-place; here page 1 falls through and is ALSO
            # enqueued below — confirm whether that is intended.
            self.step3(params)
        commentUrl = Comments.COMMENTS_URL % (operaId, contentId, page,
                                              Comments.PAGE_SIZE)
        self.storeurl(commentUrl, params.originalurl, Comments.STEP_3, {
            'operaId': operaId,
            'contentId': contentId
        })
def process(self, params):
    # S2 query processing: parse the page with every url template that
    # matches and persist the merged PageBasicInfo.
    if SPIDER_CHANNEL_S2 == SpiderConfigure.getinstance().getchannel():
        # In the S2 channel only urls carrying a website type are handled.
        if SPIDER_S2_WEBSITE_TYPE not in params.customized:
            return True
    xparser = XPathUtility(params.content)
    maxitmes = 0  # NOTE(review): never updated below — apparently dead
    pageinfo = PageBasicInfo()
    template = None
    # Each iteration overwrites pageinfo/items with the latest parse
    # result, so the LAST matching template wins.
    for template in TemplateManager.getxpaths(params.url):
        Logger.getlogging().debug('URL_TEMPLATE {url}\t{template}'.format(
            url=params.url,
            template=template[TemplateManager.XPATH_KEY_URL_TEMPLATE]))
        pageinfo, items = self.parsefromcontent(params, template, xparser)
        if constant.SPIDER_S2_WEBSITE_TYPE in params.customized:
            pageinfo.type = params.customized[
                constant.SPIDER_S2_WEBSITE_TYPE]
    #if not params.page_title and not pageinfo.title and not params.lastretry:
    #return False
    # template stays None only when no template matched at all
    if template is None:
        Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
    # Fill remaining gaps with values taken directly from the page.
    pageinfo.url = params.url
    if not pageinfo.title:
        pageinfo.title = params.page_title
    if not pageinfo.body:
        pageinfo.body = params.page_body
    if not pageinfo.pubtime:
        pageinfo.pubtime = params.html_time
    NewsStorage.seturlinfos(pageinfo)
def setp_2(self, params):
    """Read the LeTV comment total and enqueue the remaining comment
    pages, selecting the variety-show or tv url template by url."""
    data = json.loads(params.content)
    total = float(data['total'])
    NewsStorage.setcmtnum(params.originalurl, total)
    if int(total) == 0:
        return
    # incremental check
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pages = min(int(math.ceil(float(total - stored) / self.PAGE_SIZE)),
                self.maxpages)
    pid = params.customized['pid']
    # variety shows live under zongyi.le.com and use their own template
    if re.match(r'^http://zongyi\.le\.com/.*', params.url):
        template = LeComments.COMMENTS_URL_ZONGYI1
    else:
        template = LeComments.COMMENTS_URL_TV
    for pageno in range(1, pages + 1):
        if pageno == 1:
            # page 1 content was already downloaded
            self.geturlcomments(params)
            continue
        self.storeurl(template % (pageno, pid), params.originalurl,
                      LeComments.STEP_4)
def step2(self, params):
    """Read the changyan thread info for a narutom video and enqueue
    the comment pages that still need fetching."""
    payload = json.loads(params.content)
    if 'thread' not in payload:
        Logger.log(params.originalurl,
                   constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    threadid = payload['thread']['thread_id']
    total = int(payload['cursor']['total'])
    # no comments at all -> nothing to do
    if total == 0:
        Logger.log(params.originalurl,
                   constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    # store the fresh total, then compare with what we already have
    NewsStorage.setcmtnum(params.originalurl, total)
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pages = min(
        int(math.ceil(float(total - stored) / self.DEFAULT_PAGE_SIZE)),
        self.maxpages)
    for pageno in range(1, pages + 1):
        nexturl = NarutomVideoComments.COMMENTS_URL.format(
            threadid=threadid,
            limit=NarutomVideoComments.DEFAULT_PAGE_SIZE,
            page=pageno)
        self.storeurl(nexturl, params.originalurl,
                      NarutomVideoComments.STEP_3)
def getclick(self, params):
    """Queue the LeTV play-count request and backfill the publish date
    when none is stored yet.

    Fix: the sports branch passed the not-yet-defined name
    `publishdate` to TimeUtility.getuniformtime instead of the
    `pubTime` string it had just extracted, raising a NameError
    whenever that branch ran.
    """
    pattern = 'https?://\w+\.le\.com.*/\w+/(\d+)\.html'
    if re.search(pattern, params.originalurl):
        if self.r.search(pattern, params.originalurl):
            vid = self.r.parse(pattern, params.originalurl)[0]
            playcount_url = self.PALYCOUNT_URL.format(vid=vid)
            self.storeurl(playcount_url, params.originalurl,
                          LeComments.STEP_PALY)
    # Only fill in the publish date when nothing is stored yet.
    if NewsStorage.getpublishdate(
            params.originalurl) == TimeUtility.getintformtime(0):
        if self.r.search('https?://sports\.le\.com/video/\d+\.html',
                         params.originalurl):
            # sports channel only: date lives in the video info block
            pubTime = XPathUtility(
                params.content).getstring('//*[@class="live-vedio-infor"]')
            publishdate = TimeUtility.getuniformtime(pubTime)
            NewsStorage.setpublishdate(params.originalurl, publishdate)
        else:
            # variety channel only: title embeds the date as YYYYMMDD
            title = XPathUtility(params.content).getstring(
                '//h1[@class="j-video-name video-name"]')
            if title:
                if re.search('\d{8}', title):
                    publishdate = re.findall('\d{8}', title)[0]
                    NewsStorage.setpublishdate(params.originalurl,
                                               publishdate)
def step2(self, params):
    """Read the changyan topic summary and enqueue the paged comment
    urls that still need fetching."""
    # parameters forwarded from the previous step (liteloadApi and
    # topic_url are read so a missing key fails fast here)
    liteloadApi = params.customized['liteloadApi']
    client_id = params.customized['client_id']
    topic_url = params.customized['topic_url']
    commentsApi = params.customized['commentsApi']
    summary = json.loads(params.content)
    total = float(summary.get('cmt_sum', 0))
    NewsStorage.setcmtnum(params.originalurl, total)
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    topic_id = summary.get('topic_id', '')
    if not topic_id:
        Logger.log(params.originalurl,
                   constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
        return
    pages = min(
        int(math.ceil(float(total - stored) / ChangyanComments.PAGE_SIZE)),
        self.maxpages)
    for pageno in range(1, pages + 1):
        nexturl = ChangyanComments.COMMENTS_URL.format(
            commentsApi=commentsApi,
            client_id=client_id,
            page_no=pageno,
            page_size=ChangyanComments.PAGE_SIZE,
            topic_id=topic_id,
        )
        self.storeurl(nexturl, params.originalurl, ChangyanComments.STEP_3)
def step2_ifeng_xiaobg(self, params):
    """ifeng "xiaobg" variant of step2: read click/comment counts from
    JSON and POST for the remaining comment pages.

    Fix: the raw JSON 'count' value is now coerced with float() exactly
    like the sibling handler ifengnews_step2; without the coercion a
    string count made the Python 2 int-vs-str comparison against the
    stored count silently wrong (strings always compare greater).
    """
    try:
        jsoncontent = json.loads(params.content)
        clicknum = float(jsoncontent.get('join_count', '-1'))
        if clicknum > 0:
            NewsStorage.setclicknum(params.originalurl, clicknum)
        curcmtnum = float(jsoncontent['count'])
        NewsStorage.setcmtnum(params.originalurl, curcmtnum)
        dbcmtnum = CMTStorage.getcount(params.originalurl, True)
        if dbcmtnum >= curcmtnum:
            return
        # comment pages still missing, capped at maxpages
        pages = int(math.ceil(
            float(curcmtnum - dbcmtnum) / self.page_size))
        if pages >= self.maxpages:
            pages = self.maxpages
        for index in range(1, pages + 1, 1):
            if index == 1:
                # first page is handled from the response in hand
                self.ifengnews_step3(params)
                continue
            self.post_data['p'] = index
            self.storeposturl(self.post_url, params.originalurl,
                              self.IFENG_NEWS_NEXT_PAGE,
                              IfengNewsComments.post_data)
    except:
        Logger.printexception()
def step2_ac(self, params):
    """Tencent animation only: read the comment count and per-page size
    from the DOM and enqueue the comment-list pages.

    Fix: guard against a zero per-page size (no
    "comment-content-detail" nodes rendered), which previously raised
    an uncaught ZeroDivisionError in the page computation.
    """
    url_id = params.customized['url_id']
    xhtml = etree.HTML(params.content)
    # the site frequently reports this count parameter incorrectly
    comments_count = xhtml.xpath(
        '//*[@id="pagination-node"]/span/em/text()')
    if comments_count:
        comments_count = int(comments_count[0])
    else:
        Logger.log(params.originalurl,
                   constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    # comments actually rendered on this page == effective page size
    page_size = len(xhtml.xpath('//*[@class="comment-content-detail"]'))
    # incremental check
    cmtnum = CMTStorage.getcount(params.originalurl, True)
    NewsStorage.setcmtnum(params.originalurl, comments_count)
    if cmtnum >= comments_count:
        return
    if not page_size:
        # would divide by zero below; treat as "comments unobtainable"
        Logger.log(params.originalurl,
                   constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    page_num = int(math.ceil((float(comments_count) / page_size)))
    if int(page_num) >= self.maxpages:
        page_num = self.maxpages
    for page in range(1, page_num + 1):
        url = self.AC_COMMENTS_URL.format(url_id, page)
        self.storeurl(url, params.originalurl,
                      self.STEP_COMMENT_NEXT_PAGE)
def step2news(self, params):
    """Scrape u17 news comment metadata out of the raw JSON text with
    regexes and enqueue the remaining comment pages."""
    total = int(self.r.parse("\"total\"\:(\d+)", params.content)[0])
    # parsed for its fail-fast side effect (IndexError on no match)
    page_count = self.r.parse("\"total_page\"\:(\d+)", params.content)[0]
    threadid = self.r.parse("\"thread_id\"\:\"(\d+)\"", params.content)[0]
    objectid = self.r.parse("\"object_id\"\:\"(\d+)\"", params.content)[0]
    NewsStorage.setcmtnum(params.originalurl, total)
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pages = min(int(math.ceil(float(total - stored) / self.page_size)),
                self.maxpages)
    for pageno in range(1, pages + 1):
        if pageno == 1:
            # first page was already fetched
            self.step3news(params)
            continue
        comment_url = U17NewsComments.COMMENT_URL_NEWS.format(
            threadid=threadid, objectid=objectid, page=pageno,
            pagesize=self.page_size, comicid=objectid)
        self.storeurl(comment_url, params.originalurl,
                      U17NewsComments.STEP_3)
def step1(self, params):
    """Seed the mkzhan comment crawl: backfill the click counter when
    absent and enqueue the first comment-list url.

    Fix: `self.r.parse(...)[0]` raised an uncaught IndexError for urls
    not matching the comic pattern, even though the code clearly
    intends to bail out on a missing comic id (`if not comic_id`); the
    result is indexed only when non-empty.
    """
    Logger.getlogging().info("MkzhanComments.STEP_1")
    # Backfill the click counter from the "popularity" span once.
    if NewsStorage.getclicknum(params.originalurl) <= 0:
        if self.r.search('<span>人气:\s<b>(.*?)<\/b>', params.content):
            clicknum = self.r.parse('<span>人气:\s<b>(.*?)<\/b>',
                                    params.content)[0]
            NewsStorage.setclicknum(params.originalurl, clicknum)
    # comic id is the first numeric path segment of the url
    ids = self.r.parse(r'^http[s]?://www\.mkzhan\.com/(\d+)/.*',
                       params.originalurl)
    comic_id = int(ids[0]) if ids else None
    if not comic_id:
        return
    comments_url = MkzhanComments.COMMENTS_URL % (comic_id, 1,
                                                  self.PAGE_SIZE)
    self.storeurl(comments_url, params.originalurl,
                  MkzhanComments.STEP_2, {'comic_id': comic_id})