def getcomments_step1(self, params):
    """Step 1 for Baidu Tieba comments.

    Extracts the thread id (tid) from the original url and the forum id
    (fid) from the page content, stores the main post body, reads the
    reply/page counts, then queues per-page reply-urls and comment-urls
    for the later crawl steps.

    :param params: crawl-task object carrying ``originalurl``, ``content``
        and a ``customized`` dict (project type — structure assumed from
        usage here; TODO confirm against the scheduler).
    """
    try:
        # Thread id appears in the url path as /p/<digits>; without it
        # there is nothing to crawl.
        tid = re.findall('/p/(\d+)', params.originalurl)
        if tid:
            tid = tid[0]
        else:
            return
        fid = self.r.getid('forum_id', params.content)
        soup = BeautifulSoup(params.content, "html5lib")
        # Main post body lives in an element whose id contains "post_content".
        body = soup.find(attrs={'id': re.compile('post_content')})
        if body:
            NewsStorage.setbody(params.originalurl, body.get_text())
        else:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
        # The two <span>s hold the reply count and the page count, in order.
        count = soup.select('.l_posts_num > .l_reply_num > span')
        if count:
            comment_count = count[0].get_text()
            page_num = count[1].get_text()
        else:
            comment_count = 0
            page_num = 1
        # Cap the number of pages crawled per run.
        if int(page_num) > self.maxpages:
            page_num = self.maxpages
        # Build the urls used to obtain the uniqid for each page.
        for page in range(1, int(page_num) + 1):
            flag = True
            if page == 1:
                # Page 1 is handled inline; its return value signals
                # whether further reply pages should be queued.
                params.customized['page'] = 1
                flag = self.getpagecomments_step2(params)
            if fid:
                # Stop queuing reply pages once step 2 reported failure
                # (flag can only be False on page 1).
                if not flag:
                    break
                reply_url = BaiduTiebaComments.REPLY_URL.format(tid=tid, fid=fid, pn=page)
                self.storeurl(reply_url, params.originalurl, BaiduTiebaComments.BAIDU_TIEBA_HUIFU_PAGE)
            # Page 1 comments were already processed above — skip re-queuing.
            if page == 1:
                continue
            comment_url = BaiduTiebaComments.COMMENT_URL.format(tid=tid, page=page)
            self.storeurl(comment_url, params.originalurl, BaiduTiebaComments.BAIDU_TIEBA_EACH_PAGE, {'page': page})
    except:
        Logger.printexception()
def step1(self, params):
    """Step 1 for gameres.com news comments.

    Validates the url template, stores the article body and the current
    comment count, performs the incremental check against the comments
    already stored, then queues one url per comment page still missing.

    :param params: crawl-task object carrying ``originalurl`` and
        ``content`` (project type — assumed from usage; TODO confirm).
    """
    if self.r.match('http://www.gameres.com/\d+.html', params.originalurl):
        # Document id is the numeric part of the url; it keys the
        # comments API below.
        docurl = self.r.parse('^http://www\.gameres\.com\/(\d+)', params.originalurl)[0]
        # NOTE(review): the original also extracted //p[@class="xg1"]
        # into an unused local (`numtext`); the dead call was removed.
        content = XPathUtility(params.content).xpath('//*[contains(@id,"postmessage")]/text()')[0]
        NewsStorage.setbody(params.originalurl, content)
        # Current comment count as shown on the page ("评论数:<n>").
        curcmtnum = int(self.r.getid(u'评论数', params.content, split=':'))
        NewsStorage.setcmtnum(params.originalurl, curcmtnum)
        # Incremental check: skip if nothing new since the last crawl.
        dbcmtnum = CMTStorage.getcount(params.originalurl, True)
        if dbcmtnum >= curcmtnum:
            return
        # Queue one url per comment page still to fetch, capped by maxpages.
        pages = int(math.ceil(float(curcmtnum - dbcmtnum) / self.page_size))
        if pages >= self.maxpages:
            pages = self.maxpages
        for page in range(1, pages + 1, 1):
            if page == 1:
                # Page 1 is already in params.content — process it inline.
                self.step3(params)
                continue
            commentinfo_url = GameresNewsComments.COMMENTS_URL.format(docurl=docurl, page=page)
            self.storeurl(commentinfo_url, params.originalurl, GameresNewsComments.STEP_3, {'page': page})
    else:
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
def step1(self, params):
    """Step 1 for several Discuz-style forum sites.

    Classifies the url (static ``<area>-<id>-<page>`` form vs
    ``forum.php?mod=...&tid=...`` form), stores the main post body,
    publish time and comment count, performs the incremental check,
    then queues the last comment page for step 1_2.
    """
    #print params.content
    try:
        website = re.findall('http://(.*?)/', params.originalurl)[0]
        # re.search('^http://[(bbs)|(gz)|(moba)].*/\w+-\d+-\d+-\d.*',params.originalurl):
        # Static-url form, e.g. http://bbs.../thread-123-1-1.html
        if re.search('^http://[(bbs)|(gz)|(moba)|(gxdmw)].*/\w+-\d+-\d+-\d.*', params.originalurl):
            area = re.findall('com/(\w+?)-', params.originalurl)[0]
            url_id = re.findall('\d+', params.originalurl)[-3]
        # forum.php query-string form.
        elif re.search('^http://[(www)|(xsbbs)|(bbs)|(moba)].*/forum\.php\?mod=\w+(&fid=\d+)?&tid=\d+', params.originalurl):
            area = re.findall('mod=(\w+?)&', params.originalurl)[0]
            url_id = re.findall('tid=(\d+)', params.originalurl)[0]
        else:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
            return
        soup = BeautifulSoup(params.content, 'html5lib')
        # Main post content, publish time, view count, reply count, page count.
        main_content = soup.find(attrs={'id': re.compile(self.commentCsskey['content_idkey'])})
        if main_content:
            main_content = main_content.get_text()
            NewsStorage.setbody(params.originalurl, main_content)
        else:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
            return
        curtimeobj = soup.find(attrs={'id': re.compile(self.commentCsskey['time_idkey'])})
        if curtimeobj:
            # Some skins put the full timestamp in a <span title="...">.
            if curtimeobj.select_one('span'):
                curtime = curtimeobj.select_one('span').get('title')
            else:
                curtime = curtimeobj.get_text()
            NewsStorage.setpublishdate(params.originalurl, TimeUtility.getuniformtime(curtime))
        if re.search('^http://www\.gxdmw\.com/.*', params.originalurl):
            # Special case: only for the http://www.gxdmw.com/ site.
            cmtnum = soup.find(attrs={'class': "vwthdreplies y"})
            curcmtnum = cmtnum.select_one('strong').get_text()
        else:
            # cmtnum[0] is the view count, cmtnum[1] the reply count.
            cmtnum = soup.select(self.cmt_page_numCSS['cmtnumCss'])
            cmt_read = cmtnum[0].get_text()
            curcmtnum = cmtnum[1].get_text()
        curcmtnum = int(curcmtnum)
        NewsStorage.setcmtnum(params.originalurl, curcmtnum)
        # Incremental check: skip if nothing new since the last crawl.
        dbcmtnum = CMTStorage.getcount(params.originalurl, True)
        if dbcmtnum >= curcmtnum:
            return
        # Read the page count directly, trying four site-specific CSS
        # selectors in order until one matches.
        pageObj = soup.select(self.cmt_page_numCSS4['pageCss'])
        if pageObj:
            Logger.getlogging().debug('first get pageObj:%s' % pageObj[-2].get_text().strip('.'))
        if not pageObj:
            pageObj = soup.select(self.cmt_page_numCSS['pageCss'])
        if not pageObj:
            pageObj = soup.select(self.cmt_page_numCSS2['pageCss'])
        if not pageObj:
            pageObj = soup.select(self.cmt_page_numCSS3['pageCss'])
        if pageObj:
            # The second-to-last pager link holds the last page number.
            page_num = pageObj[-2].get_text().strip('.')
        else:
            page_num = 1
        # Page-size limits can only be set per concrete site here.
        if re.search('^http://bbs\.(17173|17k|gamersky)\.com/.*', params.originalurl):
            page_size = self.page_size2
        elif re.search('^http://bbs\.78dm\.net/.*', params.originalurl):
            page_size = self.page_size3
        else:
            page_size = self.page_size
        # First page that may hold not-yet-stored comments.
        start = int(dbcmtnum / page_size) + 1
        end = int(page_num)
        # Clamp the crawl window to at most maxpages, keeping the newest.
        if end > start + self.maxpages:
            start = end - self.maxpages
        params.customized['page'] = 1
        if end == 1:
            self.step2(params)
            return
        if start == 1:
            self.step2(params)
        # Fetch the last page; exactly one of the two regexes matched
        # above, so `url` is always bound here.
        if re.search('^http://[(bbs)|(gz)|(moba)|(gxdmw)].*/\w+-\d+-\d+-\d.*', params.originalurl):
            url = self.COMMENTS_URL.format(website=website, area=area, url_id=url_id, page=end)
        if re.search('^http://[(www)|(xsbbs)|(bbs)|(moba)].*/forum\.php\?mod=\w+(&fid=\d+)?&tid=\d+', params.originalurl):
            url = self.FORUM_URL.format(website=website, area=area, url_id=url_id, page=end)
        if url:
            self.storeurl(url, params.originalurl, self.STEP_1_2, {'page': end, 'start': start, 'end': end, 'website': website, 'area': area, 'url_id': url_id})
        #for page in range(end, start-1, -1):
            ##if int(page) == end:
                ##params.customized['page'] = 1
                ##if not self.step2(params):
                    ##break
                ##continue
            #if re.search('^http://[(bbs)|(gz)|(moba)|(gxdmw)].*/\w+-\d+-\d+-\d.*',params.originalurl):
                #url = self.COMMENTS_URL.format(website=website,area=area,url_id=url_id,page=page)
            #if re.search('^http://[(www)|(xsbbs)|(bbs)|(moba)].*/forum\.php\?mod=\w+(&fid=\d+)?&tid=\d+',params.originalurl):
                #url = self.FORUM_URL.format(website=website,area=area,url_id=url_id,page=page)
            #if url:
                #self.storeurl(url, params.originalurl, self.STEP_COMMENT_EACH_PAGE,{'page':page})
    except:
        Logger.printexception()
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
def process(self, params):
    """Dispatch one crawl step for this news-comments site.

    STEP_1: store the article body and queue the comments-API url.
    STEP_2: parse the JSON comment total, do the incremental check and
            queue one url per remaining page.
    STEP_3: parse the comments out of a fetched page.

    :param params: crawl-task object carrying ``step``, ``originalurl``
        and ``content`` (project type — assumed from usage).
    """
    try:
        # FIX(review): the original compared with `is` (identity), which
        # silently fails for non-interned step constants; use `==` like
        # the STEP_2/STEP_3 branches below.
        if params.step == NewsComments.STEP_1:
            # Step1: derive the key from the original url; the API url
            # yields the comment total and points at the comments.
            comments_url = self.SY_COMMENTS_URL.format(
                oriurl=params.originalurl, pageSize=self.PAGESIZE, page=1)
            # Store the article body text.
            html = XPathUtility(params.content)
            body = html.getstring(
                '//*[@class="article-content"]//p/text()')
            if body:
                NewsStorage.setbody(params.originalurl, body)
            else:
                Logger.getlogging().debug(
                    'Maybe no content for {url}!'.format(
                        url=params.originalurl))
            self.storeurl(comments_url, params.originalurl, self.STEP_2)
        elif params.step == NewsComments.STEP_2:
            try:
                jsondata = json.loads(params.content)
                if 'total' in jsondata:
                    comments_count = jsondata['total']
                    NewsStorage.setcmtnum(params.originalurl, comments_count)
                    # Incremental check: skip if nothing new since the
                    # last crawl.
                    cmtnum = CMTStorage.getcount(params.originalurl, True)
                    if cmtnum >= comments_count:
                        return
                    page_num = int(
                        math.ceil(
                            float(comments_count - cmtnum) / self.PAGESIZE))
                    if page_num >= self.maxpages:
                        page_num = self.maxpages
                    for page in range(1, page_num + 1, 1):
                        if page == 1:
                            # Page 1 is already in params.content.
                            self.geturlcomments(params)
                            continue
                        comments_url = self.SY_COMMENTS_URL.format(
                            oriurl=params.originalurl,
                            pageSize=self.PAGESIZE, page=page)
                        self.storeurl(comments_url, params.originalurl,
                                      self.STEP_3, {'total': comments_count})
                else:
                    Logger.getlogging().warning('{0}:30000'.format(
                        params.originalurl))
            # Narrowed from bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed; still logged as before.
            except Exception:
                Logger.getlogging().error('{0}:30000'.format(
                    params.originalurl))
                Logger.printexception()
                return
        elif params.step == NewsComments.STEP_3:
            # Fetch/parse the comments from this page.
            self.geturlcomments(params)
        else:
            pass
    except Exception:
        Logger.printexception()
def process(self, params):
    """Dispatch one crawl step for this forum-comments site.

    STEP_1: store the first post as the body, read the pager to find the
            page count, and queue one comments url per page.
    STEP_2: parse every post on a comment page into CMTStorage.

    :param params: crawl-task object carrying ``step``, ``url``,
        ``originalurl``, ``content`` and ``customized`` (project type —
        assumed from usage).
    """
    try:
        # FIX(review): the original used `is` (identity) for both step
        # comparisons, which silently fails for non-interned constants;
        # `==` is the correct equality test.
        if params.step == self.STEP_1:
            soup = BeautifulSoup(params.content, 'html5lib')
            body = soup.find(attrs={'class': 'post_message post_first'})
            if body:
                NewsStorage.setbody(params.originalurl, body.get_text().strip())
            else:
                Logger.getlogging().debug(
                    '{url}:30000!'.format(url=params.originalurl))
            # Thread key is the url's last path segment without extension.
            keyvalue = params.url.split("/")[-1].split(".")[0]
            page = soup.select('.pager > a')
            if len(page) <= 2:
                # No real pager links -> single page.
                page = 1
            else:
                # Second-to-last pager link holds the last page number.
                page = page[-2].get_text()
                page = int(re.findall('\d+', page)[0])
            if self.pagelimit:
                if int(page) > self.pagelimit:
                    Logger.getlogging().warning(
                        'the pageMaxNumber is shutdown to {0}'.format(
                            self.pagelimit))
                    page = self.pagelimit
            for pg in range(1, int(page + 1)):
                comments_url = self.COMMENTS_URL % (keyvalue + '-' + str(pg))
                self.storeurl(comments_url, params.originalurl, self.STEP_2, {
                    'page': pg,
                    'pagetotal': page
                })
        elif params.step == self.STEP_2:
            #self.get_comments(params)
            page = params.customized['page']
            soup = BeautifulSoup(params.content, 'html5lib')
            posts = soup.select('.post_wrap')
            if not posts:
                Logger.getlogging().debug(
                    '{url}:30000!'.format(url=params.originalurl))
                return
            for post in posts:
                post_msg = post.select_one('.post_message').get_text()
                # Collapse all internal whitespace in the comment text.
                post_msg = ''.join(post_msg.split())
                # e.g. class ="user-42845238 post_time needonline " > 发表于 2017-07-27 23:53
                post_time = post.find(
                    attrs={
                        'class': re.compile('user-.+post_time needonline')
                    }).get_text()
                curtime = TimeUtility.getuniformtime(post_time)
                content = post_msg.strip()
                try:
                    # e.g. class ="user-40693231 needonline" > Akemi隅晖 < / a >
                    nick = post.find(
                        attrs={
                            'class': re.compile('user-.+ needonline')
                        }).get_text()
                # Narrowed from bare `except:`; fall back to a placeholder
                # nickname when the element is missing.
                except Exception:
                    nick = 'nickname'
                # Deduplicate before storing.
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except Exception:
        Logger.printexception()