def step3news(self, params):
    """Step 3: extract all comments from the url set in Step 2."""
    Logger.getlogging().info("ZolbbsComments.STEP_3")
    xparser = XPathUtility(params.content)
    commentsinfo = xparser.getcomments('//*[@class="comment-list-new"]//*[@class="commli"]/p')
    commentstime = xparser.getcomments('//*[@class="comment-list-new"]//*[@class="published-time"]')
    commentsnick = xparser.getcomments('//*[@class="comment-list-new"]//*[@class="user-name"]')
    # Walk the extracted nodes and store every comment not seen before.
    for index in range(0, len(commentstime)):
        # Normalize the timestamp; fall back to the loose parser on failure.
        tm = commentstime[index].strip()
        try:
            curtime = TimeUtility.getuniformtime(getuniformtime(tm), u'%Y-%m-%d %H:%M')
        except Exception:
            curtime = getuniformtime(tm)
        # Extract the comment body and nickname.
        content = commentsinfo[index]
        nick = commentsnick[index]
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
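# A minimal, self-contained sketch of the "parse-or-fall-back" time handling in
# step3news() above. TimeUtility/getuniformtime are this project's helpers; the
# sketch below assumes only the stdlib, and normalize_time() is a hypothetical name.
import datetime

def normalize_time(raw, fmt='%Y-%m-%d %H:%M'):
    # Try the strict format first; fall back to the raw string on failure,
    # mirroring the try/except pattern in step3news().
    try:
        return datetime.datetime.strptime(raw.strip(), fmt).strftime('%Y-%m-%d %H:%M:%S')
    except ValueError:
        return raw.strip()

# normalize_time('2016-05-01 12:30') -> '2016-05-01 12:30:00'
# normalize_time('3 hours ago')      -> '3 hours ago' (left to a later pass)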
def step3(self, params):
    """Step 3: extract all comments from the url set in Step 2."""
    Logger.getlogging().info("Flash8Comments.STEP_3")
    page = params.customized['page']
    xparser = XPathUtility(params.content)
    commentsinfo = xparser.getcomments('//td[@class="t_f"]')
    commentstime = xparser.getcomments('//div[@class="authi"]/em')
    comments = []
    # On the first page the first floor is the post body, so skip it.
    if page == 1:
        startIndex = 1
    else:
        startIndex = 0
    for index in range(startIndex, len(commentstime)):
        cmti = CommentInfo()
        if URLStorage.storeupdatetime(params.originalurl, commentstime[index]):
            # Keep only comments newer than the last stored update time.
            cmti.content = commentsinfo[index]
            comments.append(cmti)
    # Store the newly collected comments.
    if len(comments) > 0:
        self.commentstorage.store(params.originalurl, comments)
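# Sketch of the first-floor skip used above: on page 1 a forum thread's first
# "comment" node is the post body itself, so extraction starts at index 1.
# comment_slice() is a hypothetical name, not project API.
def comment_slice(items, page):
    # Skip the opening post on the first page only.
    start = 1 if page == 1 else 0
    return items[start:]

# comment_slice(['post body', 'reply 1', 'reply 2'], page=1) -> ['reply 1', 'reply 2']
# comment_slice(['reply 3', 'reply 4'], page=2)              -> ['reply 3', 'reply 4']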
def step1(self, params):
    """Step 1: read the total page count and build the query-page url list."""
    info = params.customized['query']
    xparser = XPathUtility(params.content)
    if not xparser.xpath('//*[@class="mytopic topiclisttr"]'):
        Logger.log(params.url, constant.ERRORCODE_WARNNING_NORESULTS)
        return
    # The pager's second-to-last anchor holds the total page count.
    pageList = xparser.getcomments('//span[@class="right"]/a')
    if len(pageList) == 1:
        pageTotal = 1
    else:
        pageTotal = pageList[-2]
    if int(pageTotal) >= self.maxpages:
        pageTotal = self.maxpages
    # Build the query url list from the total page count.
    querylist = []
    for page in range(1, int(pageTotal) + 1):
        if page == 1:
            self.step2(params)
            continue
        url = hupuS2Query.HUPU_QUERY_TEMPLATE.format(q=info, pn=page)
        querylist.append(url)
    self.__storeqeuryurllist__(querylist, hupuS2Query.HUPU_S2QUERY_EACH_PAGE, {'query': info})
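# Sketch of the pager parsing above: the second-to-last <a> in the pager holds
# the total page count (the last anchor is the "next page" link), capped at
# maxpages. total_pages() is a hypothetical name.
def total_pages(page_anchors, maxpages):
    total = 1 if len(page_anchors) <= 1 else int(page_anchors[-2])
    return min(total, maxpages)

# total_pages(['1', '2', '17', u'下一页'], maxpages=50) -> 17
# total_pages(['1'], maxpages=50)                       -> 1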
def bbs_step3(self, params):
    try:
        xparser = XPathUtility(params.content)
        page = params.customized['page']
        pagecount = params.customized['pagecount']
        updatetimes = []
        nicks = []
        contents = xparser.getcomments('//*[@class="read"]')
        mid_times = xparser.getlist('//td[@class="authorname"]')
        # Each author cell reads "<nick>于<time>留言" ("<nick> commented at <time>");
        # split it into nick and timestamp.
        for times in mid_times:
            updatetimes.append(self.r.parse(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', times)[0])
            nicks.append(self.r.parse(ur'(.*)于', times)[0])
        # On the first page the opening post occupies index 0, so skip it.
        if page == 0:
            mid_index = 1
        else:
            mid_index = 0
        comments_number = xparser.getnumber('//*[@id="msgsubject"]/font')
        if comments_number != 0:
            for index in range(mid_index, len(contents)):
                curtime = TimeUtility.getuniformtime(updatetimes[index])
                content = contents[index]
                # Drop the trailing "于..." part and any decorative "☆" prefix.
                nick = nicks[index].split(u'于')[0].split(u'☆')[-1]
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except Exception:
        traceback.print_exc()
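# Sketch of the author-cell parsing above, as one stdlib-only helper.
# split_author_cell() is a hypothetical name.
import re

def split_author_cell(cell):
    m = re.search(ur'(.*)于(\d+-\d+-\d+ \d+:\d+:\d+)留言', cell)
    if not m:
        return None, None
    # Strip a decorative "☆" prefix from the nick, as bbs_step3() does.
    nick = m.group(1).split(u'☆')[-1].strip()
    return nick, m.group(2)

# split_author_cell(u'☆某用户于2016-05-01 12:30:00留言')
#   -> (u'某用户', u'2016-05-01 12:30:00')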
def process(self, params):
    Logger.getlogging().info(params.url)
    try:
        if params.step == Dm123BbsComments.STEP_1:
            xparser = XPathUtility(params.content)
            # Use the incoming url to decide whether more pages follow.
            keyvalue = self.r.parse('tid-(.*?).html', params.url)[0]
            pagecount = xparser.getnumber('//*[@class="pages"]/div[@class="fl"]')
            commentinfo_url = params.url
            self.storeurl(commentinfo_url, params.originalurl, Dm123BbsComments.STEP_2,
                          {'keyvalue': keyvalue, 'totalpage': pagecount, 'curpage': 1})
        elif params.step == Dm123BbsComments.STEP_2:
            keyvalue = params.customized['keyvalue']
            curpage = params.customized['curpage']
            xparser = XPathUtility(params.content)
            commentsinfo = xparser.getcomments('//div[contains(@class,"tpc_content")]')
            commentstime = self.r.parse(ur'\"(\d+-\d+-\d+ \d+:\d+)\">发表于:', params.content)
            comments = []
            for index in range(0, len(commentstime)):
                cmti = CommentInfo()
                # Compare against each comment's own time, not always the first one.
                if URLStorage.storeupdatetime(
                        params.originalurl,
                        TimeUtility.getuniformtime(commentstime[index] + ':00')):
                    cmti.content = commentsinfo[index]
                    comments.append(cmti)
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
            # Queue the next page while still within the total page count.
            if curpage + 1 <= int(params.customized['totalpage']):
                nextpage = '{key}-page-{page}'.format(key=keyvalue, page=curpage + 1)
                comment_url = Dm123BbsComments.COMMENT_URL.format(page=nextpage)
                self.storeurl(comment_url, params.originalurl, Dm123BbsComments.STEP_2,
                              {'keyvalue': keyvalue,
                               'totalpage': params.customized['totalpage'],
                               'curpage': curpage + 1})
    except Exception:
        traceback.print_exc()
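# Sketch of the self-scheduling pagination above: each STEP_2 callback stores
# the url for curpage + 1 until totalpage is reached, so the crawl advances one
# page per fetch. Hypothetical names; `queue` stands in for self.storeurl().
def schedule_next_page(queue, url_template, keyvalue, curpage, totalpage):
    # Stop once the page just processed was the last one.
    if curpage + 1 > int(totalpage):
        return False
    nextpage = '{key}-page-{page}'.format(key=keyvalue, page=curpage + 1)
    queue.append((url_template.format(page=nextpage), curpage + 1))
    return True

# q = []
# schedule_next_page(q, 'http://example.com/read-{page}.html', 'tid-123', 1, 3)
# q -> [('http://example.com/read-tid-123-page-2.html', 2)]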
def geturlcomments(self, params):
    xparser = XPathUtility(params.content)
    page = params.customized['page']
    # Page 1 carries the topic itself in the first table, so skip it in the xpath.
    if page == 1:
        commentstimes = xparser.getcomments('//table[position()>1]/tbody/tr/td/span[1]')
        commentscontents = xparser.getcomments('//table[position()>1]/tbody/tr[2]/td[@class="post-main"]')
    else:
        commentstimes = xparser.getcomments('//table/tbody/tr/td/span[1]')
        commentscontents = xparser.getcomments('//table/tbody/tr[2]/td[@class="post-main"]')
    commentsnicks = xparser.getcomments('//*[@class="name"]/a')
    # Store every extracted comment that is not already in storage.
    for index in range(0, len(commentscontents)):
        # Skip the 4-character label prefix before the timestamp.
        curtime = TimeUtility.getuniformtime(commentstimes[index][4:])
        content = commentscontents[index].strip()
        nick = commentsnicks[index].strip()
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def step3(self, params):
    """Step 3: extract all comments from the url set in Step 2."""
    Logger.getlogging().info("Dm123NewsComments.STEP_3")
    is_only_one_page = params.customized['is_only_one_page']
    if is_only_one_page:
        # Single-page case: Step 2 already extracted the nodes and passed them down.
        commentsinfos = params.customized['commentsinfos']
        commentstimes = params.customized['commentstimes']
    else:
        xparser = XPathUtility(params.content)
        commentsinfos = xparser.getcomments('//div[@class="rbvalueout"]')
        commentstimes = xparser.getcomments('//span[@class="rbtime"]')
    comments = []
    for index in range(0, len(commentstimes)):
        commentstime = commentstimes[index].strip()
        if URLStorage.storeupdatetime(params.originalurl, commentstime):
            cmti = CommentInfo()
            cmti.content = commentsinfos[index].strip()
            comments.append(cmti)
    # Store the newly collected comments.
    if len(comments) > 0:
        self.commentstorage.store(params.originalurl, comments)
def step2(self, params):
    Logger.getlogging().info("Dm123NewsComments.STEP_2")
    classid = params.customized['classid']
    id = params.customized['id']
    xparser = XPathUtility(params.content)
    # Total comment count; the node yields 0 when the comments fit on one page.
    comments_count = xparser.getnumber('//div/a[1]/b')
    if 0 == comments_count:
        # A count of 0 means either no comments at all or a single page of them,
        # so re-count from the comment nodes themselves.
        commentsinfos = xparser.getcomments('//div[@class="rbvalueout"]')
        commentstimes = xparser.getcomments('//span[@class="rbtime"]')
        comments_count = len(commentsinfos)
        if 0 == comments_count:
            return
        # Incremental check: skip if nothing new since the last crawl.
        cmtnum = URLStorage.getcmtnum(params.originalurl)
        if cmtnum >= comments_count:
            return
        URLStorage.setcmtnum(params.originalurl, comments_count)
        self.storeurl(params.originalurl, params.originalurl, Dm123NewsComments.STEP_3,
                      {'is_only_one_page': True,
                       'commentsinfos': commentsinfos,
                       'commentstimes': commentstimes})
    else:
        # Incremental check: skip if nothing new since the last crawl.
        cmtnum = URLStorage.getcmtnum(params.originalurl)
        if cmtnum >= comments_count:
            return
        URLStorage.setcmtnum(params.originalurl, comments_count)
        # Number of comment pages.
        page_count = int(math.ceil(float(comments_count) / self.page_size))
        for page in range(0, int(page_count)):
            comment_url = Dm123NewsComments.COMMENT_URL.format(page=page, classid=classid, id=id)
            self.storeurl(comment_url, params.originalurl, Dm123NewsComments.STEP_3,
                          {'is_only_one_page': False})
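# Sketch of the page-count arithmetic above: the number of comment pages is
# ceil(total_comments / page_size). pages_for() is a hypothetical name.
import math

def pages_for(comments_count, page_size):
    return int(math.ceil(float(comments_count) / page_size))

# pages_for(0, 10)  -> 0 (nothing to fetch)
# pages_for(10, 10) -> 1
# pages_for(11, 10) -> 2 (the spillover comment needs its own page)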
def step1(self, params):
    """Step 1: parse the thread id from the doc url and build the comment-page urls."""
    docurl = self.r.parse('^http[s]{0,1}://bbs\.hupu\.com\/(\d+)', params.originalurl)
    if docurl:
        docurl = docurl[0]
    else:
        Logger.getlogging().debug('{url}:20000'.format(url=params.originalurl))
        return
    xparser = XPathUtility(params.content)
    # Page count: the second-to-last pager anchor holds the last page number.
    pageList = xparser.getcomments('//div[@class="page"]/a')
    if not pageList:
        pagenum = 1
    else:
        pagenum = pageList[-2]
    if int(pagenum) >= self.maxpages:
        pagenum = self.maxpages
    # Total comment count.
    curcmtnum = xparser.getnumber('//span[@class="browse"]')
    NewsStorage.setcmtnum(params.originalurl, curcmtnum)
    dbcmtnum = CMTStorage.getcount(params.originalurl, True)
    if dbcmtnum >= curcmtnum:
        return
    # Resume from the page after the last stored comment, capped at maxpages.
    start = int(dbcmtnum / self.page_size) + 1
    end = int(pagenum)
    if end > start + self.maxpages:
        start = end - self.maxpages
    params.customized['page'] = 1
    if end == 1:
        self.step2(params)
        return
    if start == 1:
        self.step2(params)
    comment_url = self.COMMENT_URL.format(docurl=docurl, page=end)
    self.storeurl(comment_url, params.originalurl, hupuComments.STEP_1_2,
                  {'docurl': docurl, 'page': end, 'start': start, 'end': end})
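# Sketch of the incremental window computed in step1() above: resume from the
# page after the last stored comment, and clamp the window so a long backlog
# does not trigger a full recrawl. page_window() is a hypothetical name.
def page_window(stored_count, total_pages, page_size, maxpages):
    start = int(stored_count // page_size) + 1
    end = int(total_pages)
    # Clamp so the window ends at the newest page.
    if end > start + maxpages:
        start = end - maxpages
    return start, end

# page_window(stored_count=95, total_pages=50, page_size=10, maxpages=30)
#   -> (20, 50): the resume point 10 is pulled up to end - maxpages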
def step3bbs(self, params):
    """Step 3: extract all comments from the url set in Step 2."""
    Logger.getlogging().info("Ea3wcomments.STEP_3")
    xparser = XPathUtility(params.content)
    commentsinfo = xparser.getcomments('//p[@class="comment-content"]')
    commentstime = xparser.getcomments('//span[@class="time"]')
    comments = []
    for index in range(0, len(commentsinfo)):
        cmti = CommentInfo()
        cmti.content = commentsinfo[index]
        # "刚刚" ("just now") has no parseable date; substitute the current time.
        if str(commentstime[index]).strip().decode("utf8") == u'刚刚':
            tm = getuniformtime(str(datetime.datetime.now()))
        else:
            tm = getuniformtime(str(commentstime[index]))
        if URLStorage.storeupdatetime(params.originalurl, tm):
            comments.append(cmti)
    # Store the newly collected comments.
    if len(comments) > 0:
        self.commentstorage.store(params.originalurl, comments)
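# Sketch of the relative-timestamp handling above: the site renders very recent
# comments as "刚刚" ("just now"), which has no parseable date, so it is mapped
# to the current time before normalization. resolve_time() is a hypothetical name.
import datetime

def resolve_time(label):
    if label.strip() == u'刚刚':
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return label.strip()

# resolve_time(u'刚刚')             -> e.g. '2016-05-01 12:30:45'
# resolve_time(u'2016-04-30 08:00') -> u'2016-04-30 08:00'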
def process(self, proparam):
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step == bookComments.STEP_1:
            # Parse the article id out of the url.
            articleId = self.r.parse(
                r'^http://www\.2200book\.com/files/article/\w+/\d+/(\d+)\.htm$',
                proparam.originalurl)[0]
            # Fetch the first comment page.
            url = bookComments.COMMENTS_URL % (articleId, 1)
            self.storeurl(url, proparam.originalurl, bookComments.STEP_2,
                          {'articleId': articleId})
        elif proparam.step == bookComments.STEP_2:
            articleId = proparam.customized['articleId']
            # Read the comment page count from the pager.
            xparser = XPathUtility(proparam.content)
            page_count = int(self.r.parse(
                ur'>>(\d+)', xparser.getcomments("//*[@id='pagelink']")[0])[0])
            if page_count == 0:
                return
            # Queue every comment page.
            for page in range(1, page_count + 1):
                url = bookComments.COMMENTS_URL % (articleId, page)
                self.storeurl(url, proparam.originalurl, bookComments.STEP_3)
        elif proparam.step == bookComments.STEP_3:
            # Each page lists reply ids; queue one url per reply thread.
            rids = re.findall(r'rid=(\d+)">', proparam.content)
            for rid in rids:
                url = bookComments.COMMENTS_URL_RID % rid
                self.storeurl(url, proparam.originalurl, bookComments.STEP_4)
        elif proparam.step == bookComments.STEP_4:
            commentsInfo = []
            # Forum comments and their publication times.
            xparser = XPathUtility(proparam.content)
            comments = xparser.getcomments('//*[@id="sp_2"]/p[2]|//*[@id="b_v_5"]')
            commentTimes = self.r.parse(ur'发表于(:| )?(.+)(</p>|</div>)', proparam.content)
            for index in range(0, len(comments)):
                if URLStorage.storeupdatetime(proparam.originalurl, commentTimes[index][1]):
                    cmti = CommentInfo()
                    cmti.content = comments[index]
                    commentsInfo.append(cmti)
            # Store the newly collected comments.
            if len(commentsInfo) > 0:
                self.commentstorage.store(proparam.originalurl, commentsInfo)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception:
        traceback.print_exc()
def process(self, params):
    Logger.getlogging().info(params.url)
    try:
        if params.step == SeventeenKComments.STEP_1:
            # Step 1: parse the thread id from the doc url and build the first
            # comment-page url.
            docurl = self.r.parse(
                '^http://bbs\.17k\.com\/thread-(\d+)-\d+-1\.html',
                params.originalurl)[0]
            commentinfo_url = 'http://bbs.17k.com/thread-{docurl}-1-1.html'.format(docurl=docurl)
            self.storeurl(commentinfo_url, params.originalurl,
                          SeventeenKComments.STEP_2, {'docurl': docurl})
        elif params.step == SeventeenKComments.STEP_2:
            # Step 2: read the reply total, then queue one url per comment page.
            docurl = params.customized['docurl']
            xparser = XPathUtility(params.content)
            commentsinfo = xparser.getnumber('//*[@class="hm ptn"]/span[5]')
            # Incremental check: skip if nothing new since the last crawl.
            cmtnum = URLStorage.getcmtnum(params.originalurl)
            if cmtnum >= int(commentsinfo):
                return
            URLStorage.setcmtnum(params.originalurl, int(commentsinfo))
            pagecount = xparser.getnumber('//*[@class="pg"]/label/span')
            for page in range(1, pagecount + 1):
                comment_url = SeventeenKComments.COMMENT_URL.format(docurl=docurl, page=page)
                self.storeurl(comment_url, params.originalurl,
                              SeventeenKComments.STEP_3, {'page': page})
        elif params.step == SeventeenKComments.STEP_3:
            # Step 3: extract all comments from the urls set in Step 2.
            page = params.customized['page']
            xparser = XPathUtility(params.content)
            commentsinfo = xparser.getcomments('//*[contains(@id,"postmessage")]')
            commentstime = self.r.parse(ur'发表于 (\d+-\d+-\d+ \d+:\d+)</em>', params.content)
            comments = []
            # On the first page the first floor is the post body, so skip it.
            if page == 1:
                startIndex = 1
            else:
                startIndex = 0
            for index in range(startIndex, len(commentstime)):
                cmti = CommentInfo()
                if URLStorage.storeupdatetime(params.originalurl, commentstime[index] + ':00'):
                    # Keep only comments newer than the last stored update time.
                    cmti.content = commentsinfo[index]
                    comments.append(cmti)
            # Store the newly collected comments.
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
        else:
            Logger.getlogging().error('proparam.step == {step}'.format(step=params.step))
    except Exception:
        traceback.print_exc()
def process(self, proparam):
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step == ishangmanComments.STEP_1:
            # Parse the site section and article id out of the url.
            articleIds = re.findall(r'^http://(\w+)\.ishangman\.com/\w+/(\d+)', proparam.url)[0]
            articleId1 = articleIds[0]
            articleId2 = articleIds[1]
            # Comment type, embedded in the page script.
            commenttype = int(self.r.parse(ur'commenttype = (.*);', proparam.content)[0])
            # First comment page.
            url = ishangmanComments.COMMENTS_URL % (articleId1, articleId2, commenttype, 1)
            self.storeurl(url, proparam.originalurl, ishangmanComments.STEP_2,
                          {'articleId1': articleId1,
                           'articleId2': articleId2,
                           'commenttype': commenttype})
        elif proparam.step == ishangmanComments.STEP_2:
            articleId1 = proparam.customized['articleId1']
            articleId2 = proparam.customized['articleId2']
            commenttype = proparam.customized['commenttype']
            # Read the comment count; comics use a different node than other sections.
            xhtml = XPathUtility(html=proparam.content)
            if articleId1 == 'comic':
                comments_count = int(
                    xhtml.getlist('//*[contains(@class,"ismcartondiv1")]/p/strong')[0])
            else:
                comments_count = int(self.r.parse(
                    ur'(\d+).*', xhtml.getlist('//*[@class="comment_lctwidl"]/p')[0])[0])
            if comments_count:
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            if int(comments_count) == 0:
                return
            # Number of comment pages still to fetch, capped at maxpages.
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            for page in range(1, page_num + 1):
                url = ishangmanComments.COMMENTS_URL % (articleId1, articleId2, commenttype, page)
                self.storeurl(url, proparam.originalurl, ishangmanComments.STEP_3,
                              {'articleId1': articleId1})
        elif proparam.step == ishangmanComments.STEP_3:
            try:
                Logger.getlogging().debug(proparam.originalurl)
                commentsInfo = []
                articleId1 = proparam.customized['articleId1']
                xparser = XPathUtility(proparam.content)
                if articleId1 == 'comic':
                    # Comic comments sit in .ismcartondiv2 blocks.
                    soup = BeautifulSoup(proparam.content, 'html5lib')
                    comments = soup.select('.ismcartondiv2')
                else:
                    comments = xparser.getcomments('/html/body/div/span[2]/p[1]')
                    # Comment timestamps.
                    updateTime = xparser.getcomments('/html/body/div/span[2]/div[1]')
                for index in range(0, len(comments)):
                    cmti = CommentInfo()
                    if articleId1 == 'comic':
                        # Comic timestamps carry no year ("MM-DD HH:MM"); infer it
                        # from the current month, falling back to 2016.
                        publictime = self.r.parse(ur'(\d{2}-\d+ \d+:\d+)',
                                                  comments[index].get_text())[0]
                        cmt_month = publictime.split("-")[0]
                        curmonth = time.localtime().tm_mon
                        if int(cmt_month) < curmonth:
                            publictime = TimeUtility.getcurrentdate()[0:4] + '-' + publictime
                        else:
                            publictime = '2016' + '-' + publictime
                        tm = TimeUtility.getuniformtime(publictime)
                        content = comments[index].get_text().split('\n')[0]
                    else:
                        publictime = updateTime[index][:-8]
                        tm = getuniformtime(publictime)
                        content = comments[index]
                    if NewsStorage.storeupdatetime(proparam.originalurl, tm):
                        cmti.content = content
                        commentsInfo.append(cmti)
                # Store the newly collected comments.
                if len(commentsInfo) > 0:
                    self.commentstorage.store(proparam.originalurl, commentsInfo)
            except:
                Logger.printexception()
                Logger.getlogging().error(
                    'extract comment error from {site}'.format(site=proparam.url))
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception:
        traceback.print_exc()
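# Sketch of the year inference in STEP_3 above: comic comments carry only
# "MM-DD HH:MM", so the year is guessed by comparing the comment month with the
# current month; a month later than the current one is assumed to belong to the
# previous year. The original hard-codes 2016 as its fallback; this generalizes
# that idea. infer_year() is a hypothetical name.
import time

def infer_year(month_day_time):
    cmt_month = int(month_day_time.split('-')[0])
    now = time.localtime()
    year = now.tm_year if cmt_month <= now.tm_mon else now.tm_year - 1
    return '{0}-{1}'.format(year, month_day_time)

# With the current date being May 2016:
# infer_year('04-30 08:00') -> '2016-04-30 08:00'
# infer_year('12-25 10:00') -> '2015-12-25 10:00'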
def process(self, proparam):
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step == xinhuaBbsComments.STEP_1:
            # Parse the article id out of the url.
            articleId = re.findall(
                r'^http://forum\.home\.news\.cn/\w+/(\d+)/\d+\.html',
                proparam.originalurl)[0]
            # Fetch the first comment page.
            comments_url = xinhuaBbsComments.COMMENTS_URL % (articleId, 1)
            self.storeurl(comments_url, proparam.originalurl,
                          xinhuaBbsComments.STEP_2, {'articleId': articleId})
        elif proparam.step == xinhuaBbsComments.STEP_2:
            articleId = proparam.customized['articleId']
            xparser = XPathUtility(proparam.content)
            pages = xparser.getcomments('//*[@id="postreply"]/div[2]/ul[1]/li/a')
            comments = xparser.getcomments('//*[@id="postreply"]/dl/dd/div/p[2]')
            comments_count = len(comments)
            # Single-page thread: no pager, but comments exist.
            if len(pages) == 0 and comments_count != 0:
                url = xinhuaBbsComments.COMMENTS_URL % (articleId, 1)
                self.storeurl(url, proparam.originalurl, xinhuaBbsComments.STEP_3)
            # Incremental check: skip if nothing new since the last crawl.
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            if cmtnum >= comments_count:
                return
            # Cap the page count, then crawl the window from newest to oldest.
            page_num = len(pages)
            if page_num >= self.maxpages:
                page_num = self.maxpages
            start = int(cmtnum / self.PAGE_SIZE) + 1
            end = int(page_num)
            if end > start + self.maxpages:
                start = end - self.maxpages
            for page_num in range(end, start - 1, -1):
                url = xinhuaBbsComments.COMMENTS_URL % (articleId, page_num)
                self.storeurl(url, proparam.originalurl, xinhuaBbsComments.STEP_3)
        elif proparam.step == xinhuaBbsComments.STEP_3:
            # Extract the comments of one page.
            xparser = XPathUtility(proparam.content)
            comments = xparser.getcomments('//*[@id="postreply"]/dl/dd/div/p[2]')
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            comments_count = len(comments)
            NewsStorage.setcmtnum(proparam.originalurl, comments_count + cmtnum)
            if len(comments) != 0:
                # Publication times, matched to the comments by position.
                publicTimes = re.findall(
                    ur'<li><span id="time_\d+">(\d+-\d+-\d+ \d+:\d+:\d+)发表</span></li>',
                    proparam.content)
                nicks = xparser.getcomments('//*[@id="postreply"]/dl/dd/ul[1]/li[1][a]')
                for publicIndex, content in enumerate(comments):
                    curtime = TimeUtility.getuniformtime(publicTimes[publicIndex])
                    nick = nicks[publicIndex]
                    if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception:
        traceback.print_exc()
def process(self, params):
    Logger.getlogging().info(params.url)
    try:
        if params.step == Xie17NewsComments.STEP_1:
            # Step 1: parse the book id from the doc url.
            articleId = self.r.parse('^http://xiaoshuo\.17xie\.com/book/(\d+)/',
                                     params.originalurl)[0]
            # Fetch the first comment page.
            comments_url = Xie17NewsComments.COMMENT_URL % (articleId, 1)
            self.storeurl(comments_url, params.originalurl,
                          Xie17NewsComments.STEP_2, {'articleId': articleId})
        elif params.step == Xie17NewsComments.STEP_2:
            articleId = params.customized['articleId']
            # Total comment count ("共N人说过" reads "N people commented").
            comment_count = float(self.r.parse(ur'共(\d+)人说过', params.content)[0])
            if comment_count == 0:
                return
            # Incremental check: skip if nothing new since the last crawl.
            cmtnum = URLStorage.getcmtnum(params.originalurl)
            if cmtnum >= comment_count:
                return
            URLStorage.setcmtnum(params.originalurl, comment_count)
            # Queue one url per comment page.
            page_count = int(math.ceil(comment_count / Xie17NewsComments.PAGE_SIZE))
            for page in range(1, page_count + 1):
                url = Xie17NewsComments.COMMENT_URL % (articleId, page)
                self.storeurl(url, params.originalurl, Xie17NewsComments.STEP_3)
        elif params.step == Xie17NewsComments.STEP_3:
            # Step 3: extract all comments from the urls set in Step 2.
            Logger.getlogging().info("params.step == 3")
            xparser = XPathUtility(params.content)
            # Each comment body spans three consecutive <dd> nodes.
            comments = xparser.getcomments('/html/body/ul/li[2]/dl/dd')
            commenttimes = xparser.xpath('/html/body/ul/li[2]/dl/dt/text()')
            commentsInfo = []
            for index in range(0, len(commenttimes)):
                # Normalize the timestamp; "YYYY年MM月" needs an explicit format.
                if self.r.search(ur'\d+年\d+月', commenttimes[index].strip()):
                    tm = TimeUtility.getuniformtime(str(commenttimes[index]).strip(), '%Y年%m月')
                else:
                    tm = getuniformtime(commenttimes[index].strip())
                if URLStorage.storeupdatetime(params.originalurl, tm):
                    cmti = CommentInfo()
                    cmti.content = (comments[index * 3] + comments[index * 3 + 1]
                                    + comments[index * 3 + 2])
                    commentsInfo.append(cmti)
            # Store the newly collected comments.
            if len(commentsInfo) > 0:
                self.commentstorage.store(params.originalurl, commentsInfo)
        else:
            Logger.getlogging().error('proparam.step == {step}'.format(step=params.step))
    except Exception:
        traceback.print_exc()
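# Sketch of the triple grouping in STEP_3 above: the page splits each comment
# body across three consecutive <dd> nodes, so nodes 3*i .. 3*i+2 are joined
# back into comment i. group_comment_parts() is a hypothetical name.
def group_comment_parts(parts, group_size=3):
    return [''.join(parts[i:i + group_size])
            for i in range(0, len(parts), group_size)]

# group_comment_parts(['a1', 'a2', 'a3', 'b1', 'b2', 'b3'])
#   -> ['a1a2a3', 'b1b2b3']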