def step3(self, params):
    """Extract comments (and nested replies) from the comment-API JSON.

    Each record is stored via CMTStorage.storecmt(originalurl, content,
    publish time, nick), skipping entries already recorded.
    """
    try:
        jsondata = json.loads(params.content)
        if jsondata['comments']:
            for comment in jsondata['comments']:
                content = comment['content']
                curtime = TimeUtility.getuniformtime(comment['create_time'])
                # Use .get(): some passports lack 'nickname' (same fallback
                # as the reply branch below). The original's direct indexing
                # raised KeyError, which the broad except turned into an
                # abort of the whole remaining batch.
                nick = comment['passport'].get('nickname', 'anonymous')
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
                # Walk nested replies: each pass stores the current reply
                # list, then descends into the replies of the last reply
                # seen (`comment` is rebound by the inner for loop).
                reply = comment['comments']
                while reply:
                    for comment in comment['comments']:
                        content = comment['content']
                        curtime = TimeUtility.getuniformtime(comment['create_time'])
                        nick = comment['passport'].get('nickname', 'anonymous')
                        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
                    reply = comment['comments']
    except:
        Logger.printexception()
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
def process(self, params):
    """ThepaperComments entry: STEP_1 derives contid and queues the first
    comments url; STEP_2 parses the returned answer divs.

    NOTE(review): this block is truncated in the source chunk -- it ends at
    a dangling `else:` whose body (the non-paged branch of the pagination
    logic) is missing.
    """
    try:
        if params.step is ThepaperComments.STEP_1:
            # derive the id used to build the comments url from the article url
            contid = params.originalurl.split('_')
            contid = contid[-1]
            # build the first comments-page url
            comments_url = ThepaperComments.SOURCE_COMMENTS_URL.format(
                contid=contid)
            # hand the url to the download platform; result comes back as STEP_2
            self.storeurl(comments_url, params.originalurl,
                          ThepaperComments.STEP_2, {'contid': contid})
        elif params.step == ThepaperComments.STEP_2:
            contid = params.customized['contid']
            soup = BeautifulSoup(params.content, 'html5lib')
            divs = soup.find_all(attrs={
                'id': re.compile('comment'),
                'class': 'comment_que'
            })
            if not divs:
                return
            if self.r.search(ur'startId=(.*)', params.url):
                # paged request: skip the first div
                for index in range(1, len(divs), 1):
                    tm = divs[index].select_one(
                        '.aqwright > h3 > span').get_text()
                    curtime = getuniformtime(tm)
                    content = divs[index].select_one(
                        '.aqwright > .ansright_cont > a').get_text()
                    nick = divs[index].select_one(
                        '.aqwright > h3 > a').get_text()
                    if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content, curtime, nick)
            else:
                # first page: every div is a comment
                for index in range(0, len(divs), 1):
                    tm = divs[index].select_one(
                        '.aqwright > h3 > span').get_text()
                    curtime = getuniformtime(tm)
                    content = divs[index].select_one(
                        '.aqwright > .ansright_cont > a').get_text()
                    nick = divs[index].select_one(
                        '.aqwright > h3 > a').get_text()
                    if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content, curtime, nick)
            if self.r.search(ur'startId=(.*)', params.url):
                hotIds = params.customized['hotIds']
            else:
            # NOTE(review): source truncated here -- else body missing
def process(self, params):
    """AllComments entry: STEP_1 finds the article id and queues the page-1
    comments url; STEP_2 parses one page and queues the next.

    Pagination is self-scheduling: every STEP_2 response queues page+1 and
    the chain ends when a page contains no comment divs.
    """
    try:
        if params.step is AllComments.STEP_1:
            try:
                # article id embedded in the page body
                action = self.r.parse('\"id\":(\d+)',params.content)[0]
                #action = action.replace("rewards","comments")
                comments_url = AllComments.COMMENTS_URL.format(id=action,page=1)
                self.storeurl(comments_url, params.originalurl, AllComments.STEP_2, {'action':action,'page':1})
            except:
                # no id found on this page -> nothing to crawl
                return
        elif params.step is AllComments.STEP_2:
            soup = BeautifulSoup(params.content,'html5lib')
            divs = soup.find_all(attrs={'id':re.compile('comment'),'class':re.compile('note-comment')})
            if not divs:
                # empty page ends the pagination chain
                return
            for div in divs:
                #print ''.join(div.get_text().split())
                content = div.select_one('div.content > p').get_text()
                pubtime = div.select_one('div.content > .meta-top > .reply-time > a').get_text()
                curtime = TimeUtility.getuniformtime(pubtime)
                # stored with an empty nick: the markup parsed here carries
                # no author field
                if not CMTStorage.exist(params.originalurl, content, curtime, ''):
                    CMTStorage.storecmt(params.originalurl, content, curtime, '')
                # nested replies under the same comment node
                child_divs = div.select('.content > .child-comment-list > .child-comment')
                if child_divs:
                    #child_divs = soup.find_all(attrs={'id':re.compile('comment')})
                    for item in child_divs:
                        comment = item.select_one('p').get_text()
                        content = ''.join(comment.split())
                        replytime = item.select_one('.reply-time > a').get_text()
                        curtime = TimeUtility.getuniformtime(replytime)
                        if not CMTStorage.exist(params.originalurl, content, curtime, ''):
                            CMTStorage.storecmt(params.originalurl, content, curtime, '')
            # self.commentstorage.store(params.originalurl, comments)
            # schedule the next page
            comments_url = AllComments.COMMENTS_URL.format(id=params.customized['action'], page = int(params.customized['page'])+1)
            self.storeurl(comments_url, params.originalurl, AllComments.STEP_2, {'action':params.customized['action'], 'page':int(params.customized['page'])+1})
    except:
        Logger.printexception()
def step3bbs(self, params):
    """Extract main replies from the JSON returned for the STEP_2 url."""
    Logger.getlogging().info("JoyComments.STEP_3")
    # Validate the payload shape first; a malformed response is logged and
    # skipped rather than crashing the extractor.
    try:
        commentsinfo = json.loads(params.content)
        commentsinfo['result']['mainreplys']['rows']
    except:
        Logger.getlogging().warning(
            '{url} Errorcode:40000'.format(url=params.originalurl))
        Logger.printexception()
        return
    # One row per main reply: body text, post time and author name.
    for row in commentsinfo['result']['mainreplys']['rows']:
        reply = row['reply']
        content = reply['reply']['body']['text']
        curtime = TimeUtility.getuniformtime(str(reply['reply']['post_time']))
        nick = reply['user']['name']
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def process(self, proparam):
    """jiemian.com comment crawl: STEP_1 reads the comment count and fans
    out one download per comment page; STEP_3 scrapes one page by regex."""
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step is jiemianComments.STEP_1:
            # article id from the url path
            articleId = re.findall(r'^http://www\.jiemian\.com/\w+/(\d+)', proparam.url).__getitem__(0)
            # record the click count
            self.setclick(proparam)
            # total comment count scraped from the page body
            comments_count = float(re.findall(r'"comment_count">(\d+)</span>', proparam.content).__getitem__(0))
            if comments_count:
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            # nothing to fetch
            if int(comments_count) == 0:
                return
            # incremental crawl: skip when no new comments since last run
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            if cmtnum >= comments_count:
                return
            page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            # queue one download per comment page
            for page in range(1, page_num + 1, 1):
                url = jiemianComments.COMMENTS_URL % (articleId, page)
                self.storeurl(url, proparam.originalurl, jiemianComments.STEP_3)
        elif proparam.step == jiemianComments.STEP_3:
            # proparam.content = proparam.content.replace('\\','')
            # soup = BeautifulSoup(proparam.content, 'html5lib')
            # items = soup.select('.comment-post')
            # for item in items:
            #     content = item.select_one('.comment-main > p').get_text().encode('utf-8')
            #     curtime = TimeUtility.getuniformtime(item.select_one('.date').get_text())
            #     nick = item.select_one('.author-name').get_text().decode('utf-8').encode('utf-8')
            # up-vote count
            votenum = self.r.getid('ding', proparam.content)
            if votenum == '':
                Logger.getlogging().debug("Unable to get playcount")
            else:
                NewsStorage.setvotenum(proparam.originalurl, votenum)
            # regex-scrape bodies, times and nicks from the escaped page body
            comments = re.findall(r'<p>(.+?)<\\/p>', proparam.content)
            ctime = re.findall(r'<span class=\\"date\\">(.+?)<\\/span>',proparam.content)
            nicks = re.findall(r'class=\\"author-name\\">(.+?)<\\/a>', proparam.content)
            # NOTE(review): the three lists are assumed parallel and equal in
            # length -- an IndexError here is swallowed by the except below.
            for index in range(0,len(comments)):
                time = ctime[index].replace('\\', '')
                curtime = TimeUtility.getuniformtime(time)
                # NOTE(review): eval() on scraped text decodes \uXXXX escapes
                # but executes arbitrary code if the page is malicious.
                content = eval('u"' + comments[index] + '"').encode('utf-8')
                nick = eval('u"' + nicks[index] + '"').encode('utf-8')
                if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception, e:
        traceback.print_exc()
def step3news(self, params): Logger.getlogging().info("ZolbbsComments.STEP_3") # Step3: 通过Step2设置的url,得到所有评论,抽取评论 xparser = XPathUtility(params.content) commentsinfo = xparser.getcomments( '//*[@class="comment-list-new"]//*[@class="commli"]/p') commentstime = xparser.getcomments( '//*[@class="comment-list-new"]//*[@class="published-time"]') commentsnick = xparser.getcomments( '//*[@class="comment-list-new"]//*[@class="user-name"]') # 获取评论,设置实际的评论量 for index in range(0, len(commentstime), 1): # 提取时间 tm = commentstime[index].strip() try: curtime = TimeUtility.getuniformtime(getuniformtime(tm), u'%Y-%m-%d %H:%M') except Exception, e: curtime = getuniformtime(tm) # 提取评论内容 content = commentsinfo[index] nick = commentsnick[index] if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def geturlcomments(self, proparam):
    """Pull comments out of the raw page body by regex and store them."""
    # Parallel lists: escaped comment bodies, publish timestamps, nicks.
    comments = re.findall(r'content":"(.+?)","paragraph_id"',
                          proparam.content)
    commentsTime = self.r.parse(
        r'origin_created":"(\d+)","member_avatarPath"', proparam.content)
    nicks = self.r.parse(r'"nickname":"(.*?)","is_hot"', proparam.content)
    for index, comment in enumerate(comments):
        # NOTE(review): eval() on scraped text decodes \uXXXX escapes but
        # executes arbitrary expressions if the page content is malicious.
        comment = eval('u"' + comment + '"')
        content = comment.encode('utf-8')
        curtime = TimeUtility.getuniformtime(commentsTime[index])
        nick = eval('u"' + nicks[index] + '"')
        nick = nick.encode('utf-8')
        if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
            CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
def step2(self, params):
    """Store the page-1 comments and queue urls for the remaining pages."""
    Logger.getlogging().info("MkzhanComments.STEP_2")
    comic_id = params.customized['comic_id']
    payload = json.loads(params.content)
    comments_count = int(payload['data']['count'])
    cmtnum = CMTStorage.getcount(params.originalurl, True)
    # The page-1 list is already in params.content -- store it directly.
    for entry in payload['data']['list']:
        content = entry['content']
        curtime = TimeUtility.getuniformtime(entry['create_time'])
        nick = entry['username']
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    # Record the site-reported total.
    NewsStorage.setcmtnum(params.originalurl, comments_count)
    # Incremental crawl: stop when nothing new since the last run.
    if cmtnum >= comments_count:
        Logger.getlogging().warning(
            '{url}:30000 No comments'.format(url=params.originalurl))
        return
    page_num = min(
        int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE)),
        self.maxpages)
    # Pages 2..N are queued for download; page 1 was handled above.
    for page in range(2, page_num + 1, 1):
        comments_url = MkzhanComments.COMMENTS_URL % (comic_id, page,
                                                      self.PAGE_SIZE)
        self.storeurl(comments_url, params.originalurl,
                      MkzhanComments.STEP_3)
def getComments(self, params, url):
    """Parse a forum page; each td.postcontent cell after the body is a comment."""
    # Current page number, parsed out of the page url.
    pg = self.r.parse(url, params.url)[0]
    soup = BeautifulSoup(params.content, 'html5lib')
    cells = soup.select('tr > td.postcontent')
    # On page 1 the first cell is the article body, not a comment.
    start = 1 if pg == '1' else 0
    for cell in cells[start:]:
        main = cell.select_one('div[class="postmessage defaultpost"]')
        if not main:
            continue
        content = main.get_text().replace('\t', '').replace('\n', '')\
            .replace(' ', '').strip()
        # Header text carries a 4-char prefix before the date, and the time
        # lacks seconds -- strip one and append the other.
        updatetime = cell.select_one(
            'div.postinfo > font').get_text().strip()[4:] + ':00'
        curtime = getuniformtime(updatetime)
        nick = 'none'
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    # Persist the comment count actually accumulated in storage.
    comments_couts = CMTStorage.getcount(params.originalurl)
    NewsStorage.setcmtnum(params.originalurl, comments_couts)
def geturlcomments(self, params):
    """Extract comments from a forum thread page.

    On page 1 the first table is the post body, so its XPaths anchor at
    table[position()>1]; later pages use every table. The two original
    branches were identical except for that anchor, so the anchor is the
    only thing switched here.
    """
    xparser = XPathUtility(params.content)
    page = params.customized['page']
    prefix = '//table[position()>1]' if page == 1 else '//table'
    commentstimes = xparser.getcomments(prefix + '/tbody/tr/td/span[1]')
    commentscontents = xparser.getcomments(
        prefix + '/tbody/tr[2]/td[@class="post-main"]')
    commentsnicks = xparser.getcomments('//*[@class="name"]/a')
    for index in range(0, len(commentscontents), 1):
        # timestamps carry a 4-char prefix before the date -- drop it
        curtime = TimeUtility.getuniformtime(commentstimes[index][4:])
        content = commentscontents[index].strip()
        nick = commentsnicks[index].strip()
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def step3_ebook(self, params): try: jsoncontent = json.loads(params.content) if not jsoncontent.has_key('data'): return html = jsoncontent['data']['listHtml'] if not html: return soup = BeautifulSoup(html, 'lxml') divs = soup.select('div.cf') if not divs: return for div in divs: # commentList > dl:nth-child(1) > div.cf > dd > p:nth-child(2) content = div.select('dd > p')[1].get_text() curtime = TimeUtility.getuniformtime( div.select('dd > p')[0].get_text().split('|')[-1]) nick = div.select('dd > p')[0].get_text().split('|')[0] if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick) except Exception, e: Logger.printexception()
def common_step3(self, proparam):
    """NetEase (non-cloud-reading) comment extraction from JSON.

    Stores each unseen comment and stops at the first already-stored one
    (incremental crawl).
    """
    try:
        commentsinfo = json.loads(proparam.content)
    except:
        Logger.getlogging().warning(
            '{url}:30000 No comments'.format(url=proparam.originalurl))
        return
    key_comments = 'comments'
    if key_comments in commentsinfo:
        for key in commentsinfo[key_comments].keys():
            item = commentsinfo[key_comments][key]
            try:
                nickname = item['user']['nickname']
            except:
                nickname = 'anonymous'
            # BUG FIX: the original tested `if CMTStorage.exist(...)` without
            # `not`, so it re-stored comments that already existed and broke
            # on every new one. Every other extractor in this file uses
            # `if not exist: store`.
            if not CMTStorage.exist(proparam.originalurl, item['content'],
                                    item['createTime'], nickname):
                CMTStorage.storecmt(proparam.originalurl, item['content'],
                                    item['createTime'], nickname)
            else:
                # NOTE(review): dict key order is arbitrary on Python 2, so
                # stopping at the first known comment may skip unseen ones --
                # confirm the payload ordering.
                break
def bbs_step3(self, params): try: xparser = XPathUtility(params.content) page = params.customized['page'] pagecount = params.customized['pagecount'] comments = [] updatetimes = [] nicks = [] contents = xparser.getcomments('//*[@class="read"]') mid_times = xparser.getlist('//td[@class="authorname"]') for times in mid_times: updatetimes.append(self.r.parse(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', times)[0]) nicks.append(self.r.parse(ur'(.*)于', times)[0]) if page == 0: mid_index = 1 elif page > 0: mid_index = 0 comments_number = xparser.getnumber('//*[@id="msgsubject"]/font') if comments_number != 0: for index in range(mid_index, len(contents), 1): curtime = TimeUtility.getuniformtime(updatetimes[index]) content = contents[index] nick = nicks[index].split('于')[0].split('☆')[-1] if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick) except Exception, e: traceback.print_exc()
def getcomments(self, params):
    """Store one page of comments and queue the next page.

    The next-page cursor is the id of the last comment on this page; the
    original tracked it with a manual index counter, replaced here by direct
    last-element access.
    """
    docid = params.customized['docid']
    jsondata = json.loads(params.content)
    if len(jsondata['comments']) > 0:
        for comment in jsondata['comments']:
            content = comment['comment']
            curtime = comment['createAt']
            nick = comment['nickname']
            if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
        # cursor for the next page
        last_comment_id = jsondata['comments'][-1]['comment_id']
        comment_source_url = self.COMMENTS_URL.format(
            docid, self.page_size, last_comment_id)
        self.storeurl(comment_source_url, params.originalurl,
                      self.STEP_COMMENT_FIRST_PAGE, {
                          'docid': docid,
                          'last_comment_id': last_comment_id
                      })
    else:
        return
def process_book(self, params): try: if params.step == Comments.STEP_1: # 从url中获取拼接评论url的参数 bookId = self.r.parse('^http://www\.17k\.com/book/(\w+).html$', params.originalurl)[0] # 拼接第一页评论url comments_url = Comments.COMMENTS_URL % (bookId, 1, Comments.PAGE_SIZE) #通知下载平台,根据评论url获取第一页评论内容 self.storeurl(comments_url, params.originalurl, Comments.STEP_2, {'bookId': bookId}) #获取第一页评论内容,循环获取全部评论url elif params.step == Comments.STEP_2: bookId = params.customized['bookId'] # 获取评论的Jason返回值 comments = json.loads(params.content) comments_count = int(comments['page']['count']) # 判断增量 cmtnum = CMTStorage.getcount(params.originalurl) if cmtnum >= comments_count: return NewsStorage.setcmtnum(params.originalurl, comments_count) # 获取评论最后更新时间 lasttime = CMTStorage.getlastpublish(params.originalurl, True) # 获取评论页数 page_count = int(comments['page']['pagecount']) if page_count == 0: return if page_count >= self.maxpages: page_count = self.maxpages # 循环拼接评论url,提交下载平台获取评论数据 for page in range(1, page_count + 1, 1): commentUrl = Comments.COMMENTS_URL % (bookId, page, Comments.PAGE_SIZE) self.storeurl(commentUrl, params.originalurl, Comments.STEP_3, {'bookId': bookId}) #解析评论数据 elif params.step == Comments.STEP_3: commentsinfo = json.loads(params.content) for comment in commentsinfo['page']['result']: curtime = TimeUtility.getuniformtime( comment['creationDate']) content = comment['summary'] nick = comment['marks']['nikeName'] if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick) except Exception, e: traceback.print_exc()
def geturlcomemnts(self, params):
    """Decode, filter and store comments from the comment-API JSON."""
    Logger.getlogging().debug(params.originalurl)
    payload = json.loads(params.content)
    for entry in payload['data']['comment_data']:
        # body is url-encoded utf-8 text; decode, then run the site filter
        body = Common.urldec(entry['cms_body']).decode(CHARSET_UTF8)
        body = self.filterstr(body)
        Logger.getlogging().debug(body)
        if not CMTStorage.exist(params.originalurl, body,
                                entry['cms_pubdate'], entry['uname']):
            CMTStorage.storecmt(params.originalurl, body,
                                entry['cms_pubdate'], entry['uname'])
def step3(self, params):
    """Store one page of the Mkzhan comment-list JSON."""
    Logger.getlogging().info("MkzhanComments.STEP_3")
    payload = json.loads(params.content)
    for entry in payload['data']['list']:
        content = entry['content']
        curtime = TimeUtility.getuniformtime(entry['create_time'])
        nick = entry['username']
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def getcomments(self, params):
    """Store every comment found in the JSON 'data' array."""
    payload = json.loads(params.content)
    for entry in payload['data']:
        curtime = TimeUtility.getuniformtime(entry['ctime'])
        content = entry['content']
        nick = entry['user']['username']
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def getcontents(self, params):
    """Scrape post items (.all-post .post-wrap) and store each as a comment."""
    soup = BeautifulSoup(params.content, 'html5lib')
    for post in soup.select('.all-post > .post-wrap'):
        content = post.select_one('.post-body > a').get_text()
        curtime = post.select_one('.mr20').get_text()
        # '.post-auther' is the site's own (misspelled) class name
        nick = post.select_one('.post-auther > a').get_text()
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def process(self, params):
    """Ku6 video comment crawl: the first request derives the video oid,
    STEP_2 reads the total count and fans out pages, STEP_3 parses one page."""
    try:
        if params.step is None:
            # video id from the play-page url
            oid = self.r.parse('^http://v\.ku6\.com/show/([\w-]+..).html',
                               params.originalurl)[0]
            # page-1 probe url (page size 1, just to learn the count)
            comments_url = Ku6Comments.COMMENTS_URL % (oid, 1, 1)
            # hand the url to the download platform for STEP_2
            self.storeurl(comments_url, params.originalurl,
                          Ku6Comments.STEP_2, {'oid': oid})
        elif params.step == Ku6Comments.STEP_2:
            oid = params.customized['oid']
            comments = json.loads(params.content)
            # total comment count reported by the api
            comments_count = float(comments['data']['count'])
            NewsStorage.setcmtnum(params.originalurl,
                                  int(comments['data']['count']))
            if comments_count == 0:
                return
            # incremental crawl: compare against the count from the last run
            cmtnum = CMTStorage.getcmtnum(params.originalurl, True)
            if cmtnum >= comments_count:
                return
            # queue one download per comment page
            for page in range(
                    0,
                    int(math.ceil(comments_count / Ku6Comments.PAGE_SIZE)) + 1,
                    1):
                commentUrl = Ku6Comments.COMMENTS_URL % (
                    oid, Ku6Comments.PAGE_SIZE, page + 1)
                self.storeurl(commentUrl, params.originalurl,
                              Ku6Comments.STEP_3, {'oid': oid})
        elif params.step == Ku6Comments.STEP_3:
            commentsinfo = json.loads(params.content)
            if not commentsinfo['data']['list']:
                return
            for comment in commentsinfo['data']['list']:
                curtime = TimeUtility.getuniformtime(
                    int(comment['commentCtime']))
                content = comment['commentContent']
                # NOTE(review): nick is assigned the comment BODY rather than
                # an author field -- looks like a copy-paste bug; confirm the
                # api's author key and fix.
                nick = comment['commentContent']
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except Exception, e:
        Logger.printexception()
def step3(self, params):
    """Store comments from a JSON payload whose entries carry no author info."""
    jsondata = json.loads(params.content)
    for comment in jsondata['data']:
        content = comment['content']
        curtime = getuniformtime(comment['comment_time'])
        # the api exposes no author field, so a placeholder nick is stored
        # (removed an unused `commentid = comment['id']` local)
        nick = "none"
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def news_step2(self, params):
    """Extract comments (and their replies) from the comment-API JSON."""
    try:
        jsondata = json.loads(params.content)
        for comment in jsondata['body']:
            content = str(comment['commentbody'])
            nick = str(comment['commentauthor'])
            curtime = TimeUtility.getuniformtime(comment['commentdate'])
            if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
            # the payload states how many replies each comment carries
            reply_total = int(comment['reply_total'])
            if reply_total > 0:
                for index in range(0, reply_total, 1):
                    reply = comment['reply'][index]
                    content = reply['commentbody']
                    curtime = TimeUtility.getuniformtime(reply['commentdate'])
                    nick = reply['commentauthor']
                    if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except:
        Logger.printexception()
def step3(self, params):
    """Parse the wrapped weibo comment payload and store each entry.

    The response body is not bare JSON: content[2:-1] strips a 2-character
    prefix and 1-character suffix before decoding.
    """
    payload = json.loads(params.content[2:-1])
    for entry in payload['data']['weibo']:
        curtime = TimeUtility.getuniformtime(entry['pub_time'])
        content = entry['content']
        nick = str(entry['userinfo']['nickname'])
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def step3(self, params):
    """Store Toutiao news comments from the STEP_2 JSON response."""
    Logger.getlogging().info("ToutiaoNewsComments.STEP_3")
    jsoncontent = json.loads(params.content)
    # guard clauses instead of nested ifs: empty payload -> nothing to do
    if not jsoncontent:
        return
    if len(jsoncontent['data']) == 0:
        return
    for item in jsoncontent['data']:
        content = item['comment']['text']
        curtime = TimeUtility.getuniformtime(item['comment']['create_time'])
        # this endpoint exposes no author field; store an empty nick
        if not CMTStorage.exist(params.originalurl, content, curtime, ''):
            CMTStorage.storecmt(params.originalurl, content, curtime, '')
def getcomments_step3(self, params):
    """Pair up parallel content/time/nick node lists and store each comment."""
    xhtml = XPathUtility(html=params.content)
    contents = xhtml.getlist('//*[contains(@id,"partThreadContent")]')
    curtimes = xhtml.getlist('//*[@class="comment_rw"]/span/em')
    nicks = xhtml.getlist('//*[@class="wzbox"]/h5')
    for index, raw in enumerate(contents):
        # page times lack seconds -- append ':00' before normalizing
        curtime = TimeUtility.getuniformtime(curtimes[index] + ':00')
        content = str(raw)
        nick = str(nicks[index])
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def process(self, params): try: if params.step is None: # 拼接第一页评论url comments_url = PcautoComments.COMMENTS_URL % (params.originalurl, 1, PcautoComments.PAGE_SIZE) #通知下载平台,根据评论url获取第一页评论内容 self.storeurl(comments_url, params.originalurl, PcautoComments.STEP_2) #获取第一页评论内容,循环获取全部评论url elif params.step == PcautoComments.STEP_2: # 获取评论的Jason返回值 comments = json.loads(params.content) # 获取评论页数 comments_count = int(comments['total']) NewsStorage.setcmtnum(params.originalurl, comments_count) if comments_count == 0: return # 判断增量 cmtnum = CMTStorage.getcount(params.originalurl, True) if cmtnum >= comments_count: return page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE)) if page_num >= self.maxpages: page_num = self.maxpages # 循环拼接评论url,提交下载平台获取评论数据 for page in range(1, page_num + 1, 1): commentUrl = PcautoComments.COMMENTS_URL % (params.originalurl, page, PcautoComments.PAGE_SIZE) self.storeurl(commentUrl, params.originalurl, PcautoComments.STEP_3) #解析评论数据 elif params.step == PcautoComments.STEP_3: commentsinfo = json.loads(params.content) comments = [] for comment in commentsinfo['data']: updatetime = comment['createTime'] content = comment['content'] curtime = TimeUtility.getuniformtime(updatetime) try: nick = comment['nickName'] except: nick = 'anonymous' if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick) # if URLStorage.storeupdatetime(params.originalurl, updatetime): # cmti = CommentInfo() # cmti.content = comment['content'] # comments.append(cmti) # if len(comments) > 0: # self.commentstorage.store(params.originalurl, comments) except Exception, e: traceback.print_exc()
def step3(self, params):
    """Extract xinhua comments from the 'contentAll' JSON array."""
    Logger.getlogging().info("xinhuaComments.STEP_3")
    comment_json = json.loads(params.content)
    # iterate entries directly instead of range(len(...)) index juggling
    for entry in comment_json['contentAll']:
        curtime = TimeUtility.getuniformtime(entry['commentTime'])
        content = entry['content']
        nick = entry['nickName']
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def step2(self, params):
    """Extract main comments from a thread page and queue reply-page urls.

    Flow: each .atl-main > .atl-item is one main comment -- store its
    text/time/nick, then compute the number of reply pages from the reply
    count and queue one child url per page.
    """
    item = params.customized['item']
    artId = params.customized['artId']
    page = params.customized['page']
    # step1: from atl-main/atl-item take the main comment body, time,
    #        replyid and child-comment count
    # step2: reply pages = child count / page size
    # step3: child url is built from item, replyid and the page number
    soup = BeautifulSoup(params.content, 'html5lib')
    alt_items = soup.select('.atl-main > .atl-item')
    #print 'alt_items:',len(alt_items)
    if page == 1:
        # on page 1 the first item is the post body, not a comment
        alt_items = alt_items[1:]
    for alt_item in alt_items:
        # the last info span holds the publish time
        curtime = alt_item.select('.atl-head > div.atl-info > span')
        curtime = getuniformtime(curtime[-1].get_text())
        main_comment = alt_item.select_one('.bbs-content').get_text()
        replyid = alt_item.select_one('a[class="reportme a-link"]').get(
            'replyid')
        content = main_comment.strip()
        commentid = replyid
        nick = alt_item.select_one('a[class="js-vip-check"]').get_text()
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
        child_comment_num = alt_item.select_one(
            'a[class="a-link-2 ir-remark"]').get_text()
        if self.r.search('\d+', child_comment_num):
            child_comment_num = self.r.parse('\d+', child_comment_num)[0]
        else:
            # NOTE(review): this assignment is dead -- the `continue` right
            # after it skips the rest of the iteration anyway
            child_comment_num = 0
            continue
        pageNum = int(math.ceil(float(child_comment_num) / self.page_size))
        # NOTE(review): this inner loop rebinds `page` (the outer page
        # number read above); harmless today but fragile.
        for page in range(1, int(pageNum) + 1):
            child_url = self.COMMENTS_CHILD_URL.format(item=item,
                                                       artId=artId,
                                                       replyId=replyid,
                                                       page=page)
            #print 'child_url:',child_url
            self.storeurl(child_url, params.originalurl,
                          self.STEP_COMMENT_CHILD_PAGE, {
                              'item': item,
                              'artId': artId
                          })
def step3(self,params):
    """Extract comments and nested replies from the comment-API JSON.

    Each stored record is (originalurl, content, publish time, nick); nick
    falls back to 'anonymous' when the passport carries no nickname.
    """
    try:
        jsondata = json.loads(params.content)
        if jsondata['comments']:
            for comment in jsondata['comments']:
                content = comment['content']
                curtime = TimeUtility.getuniformtime(comment['create_time'])
                nick = comment['passport'].get('nickname','anonymous')
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
                # Walk nested replies: each pass stores the current reply
                # list, then descends into the replies of the LAST reply
                # seen (`comment` is rebound by the inner for loop).
                reply = comment['comments']
                while reply:
                    for comment in comment['comments']:
                        content = comment['content']
                        curtime = TimeUtility.getuniformtime(comment['create_time'])
                        nick = comment['passport'].get('nickname','anonymous')
                        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
                    reply = comment['comments']
    except:
        Logger.printexception()
        Logger.getlogging().error('extract comment error from {site}'.format(site = params.url))
def getcomments_step3(self, params):
    """Extract comments whose time/nick values sit in sibling node lists."""
    xhtml = XPathUtility(html=params.content)
    contents = xhtml.getlist('//*[@class="wz"]/p')
    curtimes = xhtml.getlist('//*[@class="fr"]')
    nicks = xhtml.getlist('//*[@class="wzbox"]/h5')
    for index, raw in enumerate(contents):
        # drop the 4-char prefix before the date and add the missing seconds
        curtime = curtimes[index][4:] + ':00'
        Logger.getlogging().debug(raw)
        content = str(raw)
        nick = str(nicks[index])
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)