def step1(self, params):
    """Start comment crawling for a Douban movie page.

    Pulls the article id out of the url, reads the total comment count
    from the page header, records it, and queues one comment-list url
    per page of new comments (capped at ``self.maxpages``).
    """
    # The article id is the numeric path segment of the movie url.
    articleId = self.r.parse(r'^https://movie\.douban\.com/\w+/(\d+)', params.url)[0]
    # Total comment count sits in the comments-section header link.
    parser = XPathUtility(params.content)
    headertext = parser.getstring(
        xpath='//*[@id="comments-section"]//h2/*[@class="pl"]/a')
    digits = self.r.parse('\d+', headertext)
    if not digits:
        return
    total = float(digits[0])
    NewsStorage.setcmtnum(params.originalurl, total)
    # Incremental crawl: skip when nothing new was posted.
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    # Only fetch enough pages to cover the delta, capped at maxpages.
    pagecount = int(math.ceil(float(total - stored) / self.PAGE_SIZE))
    if pagecount >= self.maxpages:
        pagecount = self.maxpages
    for pageno in range(1, pagecount + 1):
        pageurl = doubanComments.COMMENTS_URL.format(
            articleId=articleId,
            start=(pageno - 1) * self.PAGE_SIZE,
            pagesize=self.PAGE_SIZE)
        self.storeurl(pageurl, params.originalurl, doubanComments.STEP_2)
def step1(self, params):
    """Parse the search-result count and queue the remaining result pages.

    Page 1 is already in ``params.content`` and is handled inline via
    ``self.step2``; pages 2..N (capped at ``self.maxpages``) are queued
    for download.
    """
    key = params.customized['key']
    srchfrom = params.customized['srchfrom']
    xpath = XPathUtility(params.content)
    text = xpath.getstring('//*[@id="main"]/span')
    tstr = u'搜索总条数'
    if not self.r.search(tstr, text):
        # BUG FIX: the original had a bare ``Logger`` expression here — a
        # no-op — instead of actually emitting a log record before bailing.
        Logger.getlogging().warning(
            'search result count not found: {0}'.format(params.url))
        return
    num = self.r.parse('\d+', text)[0]
    # Number of result pages needed, capped at maxpages.
    pages = int(math.ceil(float(num) / self.page_size))
    if pages >= self.maxpages:
        pages = self.maxpages
    querylist = []
    for page in range(1, pages + 1):
        if page == 1:
            # First page is already downloaded; process it directly.
            self.step2(params)
            continue
        url = TGbusS2Query.TGBUS_QUERY_TEMPLATE.format(key=key, page=page,
                                                       srchfrom=srchfrom)
        querylist.append(url)
    if querylist:
        self.__storeqeuryurllist__(querylist,
                                   TGbusS2Query.TGBUS_S2QUERY_EACH_PAGE,
                                   {'key': key})
def process(self, params):
    """Crawl cine107 comments.

    First pass (``params.step is None``): read the total comment count
    from the chart link text — formatted like ``"...(123)"`` — and queue
    one comment url per page.  STEP_2: extract comment bodies and their
    ``<time>`` stamps from a downloaded comment page.
    """
    try:
        if params.step is None:
            # The count appears in parentheses in the link text.
            xhtml = XPathUtility(html=params.content)
            countsStr = str(
                xhtml.getstring('//*[@id="chartForm"]/div[1]/a[3]'))
            startpos = countsStr.find('(')
            if startpos < 0:
                Logger.getlogging().error(params.originalurl)
                return
            comment_counts = int(countsStr[startpos + 1:countsStr.find(')')])
            Logger.getlogging().debug(comment_counts)
            if comment_counts == 0:
                return
            # BUG FIX: force float division before ceil.  Under Python 2,
            # int / int truncates, so the original undercounted pages by
            # one whenever the count was not an exact multiple of
            # PAGE_SIZE (sibling steps all use float() here).
            pagecount = int(math.ceil(
                float(comment_counts) / Cine107Comments.PAGE_SIZE))
            for page in range(1, pagecount + 1, 1):
                commentUrl = Cine107Comments.COMMENTS_URL.format(
                    url=params.originalurl, pageno=page)
                Logger.getlogging().debug(commentUrl)
                self.storeurl(commentUrl, params.originalurl,
                              Cine107Comments.STEP_2)
            URLStorage.setcmtnum(params.originalurl, comment_counts)
        elif params.step == Cine107Comments.STEP_2:
            # Comment texts and timestamps are parallel lists.
            xhtml = XPathUtility(html=params.content)
            comments = []
            contents = xhtml.getlist(
                '//*[@class="flow_commont_list clearfix"]/p')
            updatetimes = xhtml.getlist('//*/time')
            for index in range(0, len(contents), 1):
                updatetime = TimeUtility.getuniformtime(updatetimes[index])
                # Only keep comments newer than the last recorded crawl.
                if URLStorage.storeupdatetime(params.originalurl, updatetime):
                    cmti = CommentInfo()
                    Logger.getlogging().debug(contents[index])
                    cmti.content = str(contents[index])
                    comments.append(cmti)
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
    except:
        Logger.printexception()
def getcomments_step2(self, params):
    """Read the total comment count for a book and queue comment pages.

    Compares the on-page count with the stored count and only posts
    requests covering the delta, capped at ``self.maxpages``.
    """
    bookId = params.customized['bookId']
    parser = XPathUtility(html=params.content)
    total = int(parser.getstring('//*[@class="fr"]/em'))
    Logger.getlogging().debug(total)
    # Incremental check: nothing to do when no new comments exist.
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    NewsStorage.setcmtnum(params.originalurl, total)
    pages = int(math.ceil(float(total - stored) / self.page_size))
    if pages >= self.maxpages:
        pages = self.maxpages
    # Pagination is carried in the POST payload, not the url.
    for pageno in range(1, pages + 1):
        self.storeposturl(BookComments.COMMENTS_URL, params.originalurl,
                          BookComments.STEP_3,
                          {'bookId': bookId, 'pageNum': pageno})
def setpubtime(self, params):
    """Extract and store the publish date for chanye.18183.com articles.

    The page shows the date with a two-digit year (e.g. "17-05-01"); the
    current century is prepended before normalisation.
    """
    # NOTE: handling for bbs.18183.com pages existed here but was disabled.
    newtime = None
    if re.search('http://chanye\.18183\.com/.*', params.url):
        parser = XPathUtility(params.content)
        timestr = parser.getstring(
            '//*[@class="arc-other"]/span[3]|//*[@class="other"]/span[3]')
        if not timestr:
            return
        p = '(\d{2}-\d+-\d+)'
        if re.search(p, timestr):
            # First two digits of the current year ("20") + "YY-MM-DD".
            century = str(time.localtime()[0])[0:2]
            newtime = getuniformtime(century + re.findall(p, timestr)[0])
    if newtime:
        NewsStorage.setpublishdate(params.originalurl, newtime)
def step2(self, params):
    """Parse the total comment count and queue comment-page urls.

    The first page of comments is extracted directly from
    ``params.content``; pages 2..N (capped at ``self.maxpages``) are
    queued for download as STEP_3.
    """
    parser = XPathUtility(html=params.content)
    counttext = parser.getstring('//*[@id="comment_count"]')
    if not counttext:
        return
    total = int(counttext)
    # Incremental crawl: stop when nothing new was posted.
    stored = CMTStorage.getcount(params.originalurl)
    if stored >= total:
        return
    NewsStorage.setcmtnum(params.originalurl, total)
    # The first page is already in hand.
    self.geturlcomments(params)
    # Url parameters carried over from the previous step.
    aid = params.customized['aid']
    site = params.customized['site']
    channelid = params.customized['channelid']
    title = params.customized['title']
    # Pages needed to cover the delta, capped at maxpages.
    lastpg = int(math.ceil(float(total - stored) / self.PAGE_SIZE))
    if lastpg >= self.maxpages:
        lastpg = self.maxpages
    for pageno in range(2, lastpg + 1):
        pageurl = self.COMMENT_URL.format(pg=pageno, aid=aid, site=site,
                                          channelid=channelid,
                                          oriurl=params.originalurl,
                                          title=title)
        self.storeurl(pageurl, params.originalurl, NewsComments.STEP_3_NEWS)
def step2(self, params): Logger.getlogging().info("Flash8Comments.STEP_2") # 将STEP_1中的docurl传下来 docurl = params.customized['docurl'] xparser = XPathUtility(params.content) commentsinfo = xparser.getstring('//div[@class="page"]/span/font[1]') # 保存页面评论量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= int(commentsinfo[0]): return URLStorage.setcmtnum(params.originalurl, int(commentsinfo[0])) # 总数除以page_size,然后加1,可得到评论总页数comments_count pagecount = xparser.getnumber('//*[@class="pg"]/label/span') if pagecount == 0: pagecount = pagecount + 1 for page in range(1, pagecount + 1, 1): comment_url = Flash8Comments.COMMENT_URL.format(docurl=docurl, page=page) self.storeurl(comment_url, params.originalurl, Flash8Comments.STEP_3, {'page': page})
def process(self, params):
    """Crawl news comments in three steps.

    STEP_1: store the article body and queue the first comment-API url.
    STEP_2: parse the JSON comment count, extract page 1, queue the
    remaining pages (capped at ``self.maxpages``).
    STEP_3: extract comments from a queued page.
    """
    try:
        # BUG FIX: compare the step constant with ``==`` rather than
        # ``is`` — identity comparison against a constant only works by
        # accident for small interned values.
        if params.step == NewsComments.STEP_1:
            # Step1: build the first comment-API url for this article.
            comments_url = self.SY_COMMENTS_URL.format(
                oriurl=params.originalurl, pageSize=self.PAGESIZE, page=1)
            # Store the article body while we have the page in hand.
            html = XPathUtility(params.content)
            body = html.getstring('//*[@class="article-content"]//p/text()')
            if body:
                NewsStorage.setbody(params.originalurl, body)
            else:
                Logger.getlogging().debug(
                    'Maybe no content for {url}!'.format(
                        url=params.originalurl))
            self.storeurl(comments_url, params.originalurl, self.STEP_2)
        elif params.step == NewsComments.STEP_2:
            try:
                jsondata = json.loads(params.content)
                if 'total' in jsondata:
                    comments_count = jsondata['total']
                    NewsStorage.setcmtnum(params.originalurl, comments_count)
                    # Incremental check: skip when nothing new exists.
                    cmtnum = CMTStorage.getcount(params.originalurl, True)
                    if cmtnum >= comments_count:
                        return
                    page_num = int(math.ceil(
                        float(comments_count - cmtnum) / self.PAGESIZE))
                    if page_num >= self.maxpages:
                        page_num = self.maxpages
                    for page in range(1, page_num + 1, 1):
                        if page == 1:
                            # Page 1 is the response we are holding.
                            self.geturlcomments(params)
                            continue
                        comments_url = self.SY_COMMENTS_URL.format(
                            oriurl=params.originalurl,
                            pageSize=self.PAGESIZE, page=page)
                        self.storeurl(comments_url, params.originalurl,
                                      self.STEP_3,
                                      {'total': comments_count})
                else:
                    Logger.getlogging().warning('{0}:30000'.format(
                        params.originalurl))
            except:
                Logger.getlogging().error('{0}:30000'.format(
                    params.originalurl))
                Logger.printexception()
                return
        elif params.step == NewsComments.STEP_3:
            # Extract the comments from a queued page.
            self.geturlcomments(params)
        else:
            pass
    except:
        Logger.printexception()
def process(self, params):
    """Crawl tadu.com book comments in three steps.

    STEP_1: derive the article id from the url and queue the first
    comment page.  STEP_2: read the total comment count and queue all
    pages.  STEP_3: extract comment texts and timestamps.
    """
    # NOTE(review): this method mixes Rain8Comments and TaDuComments
    # class names; they are kept as found — confirm they refer to the
    # same step constants / url templates.
    Logger.getlogging().info(params.url)
    try:
        # BUG FIX: use ``==`` instead of ``is`` for the step-constant
        # comparison (identity on constants is unreliable).
        if params.step == Rain8Comments.STEP_1:
            # Step1: article id is the numeric path segment.
            articleId = self.r.parse('http://\w+\.tadu\.com/\w+/(\d+).*',
                                     params.originalurl)[0]
            comments_url = Rain8Comments.COMMENT_URL.format(
                articleId=articleId, page=1)
            self.storeurl(comments_url, params.originalurl,
                          Rain8Comments.STEP_2, {'articleId': articleId})
        elif params.step == Rain8Comments.STEP_2:
            articleId = params.customized['articleId']
            # The total count is the second number in the <h4> header.
            xparser = XPathUtility(params.content)
            countstr = xparser.getstring('//h4')
            if self.r.search(u'\d+', countstr):
                # BUG FIX: the parsed count was left as a string, so
                # ``comment_count == 0`` was never true and
                # ``cmtnum >= comment_count`` was a cross-type (always
                # false) comparison under Python 2.  Convert to int.
                comment_count = int(self.r.parse(u'(\d+)', countstr)[1])
                if comment_count == 0:
                    return
                # Incremental check against the stored count.
                cmtnum = URLStorage.getcmtnum(params.originalurl)
                if cmtnum >= comment_count:
                    return
                URLStorage.setcmtnum(params.originalurl, comment_count)
                totalPage = int(math.ceil(
                    float(comment_count) / TaDuComments.PAGE_SIZE))
                for page in range(1, totalPage + 1, 1):
                    url = TaDuComments.COMMENT_URL.format(
                        articleId=articleId, page=page)
                    self.storeurl(url, params.originalurl,
                                  TaDuComments.STEP_3)
        elif params.step == TaDuComments.STEP_3:
            # Step3: extract all comments from the queued page.
            Logger.getlogging().info("params.step == 3")
            xparser = XPathUtility(params.content)
            comments = xparser.getlist(
                '//ul[@class="cmetlist bookreview-cmetlist"]/li/div/div[2]/p')
            commenttimes = xparser.getlist(
                '//ul[@class="cmetlist bookreview-cmetlist"]/li/div/div[2]/span')
            commentsInfo = []
            for index in range(0, int(len(comments)), 1):
                # The time string carries a 3-character prefix before
                # the date itself.
                publicTime = commenttimes[index][3:]
                cmti = CommentInfo()
                tm = TimeUtility.getuniformtime(publicTime, '%Y-%m-%d %H:%M')
                # Only keep comments newer than the last recorded crawl.
                if URLStorage.storeupdatetime(params.originalurl, tm):
                    cmti.content = comments[index].strip()
                    commentsInfo.append(cmti)
            if len(commentsInfo) > 0:
                self.commentstorage.store(params.originalurl, commentsInfo)
        else:
            Logger.getlogging().error(
                'proparam.step == {step}'.format(step=params.step))
    # BUG FIX: ``except Exception, e`` is Python-2-only syntax and bound
    # an unused name; this form is valid and equivalent.
    except Exception:
        traceback.print_exc()
def process(self, params):
    """Crawl book/manhua comments in three steps.

    STEP_1: derive the page key from the url, record the publish date
    (manhua only), and queue the first comment page.  STEP_2: read the
    total count and queue the remaining pages.  STEP_3: extract comments.
    """
    try:
        # BUG FIX: compare the step constant with ``==`` rather than
        # ``is`` — identity comparison against a constant is unreliable.
        if params.step == BookComments.STEP_1:
            # Step1: the key is the last non-empty path segment.
            urlsplit = params.originalurl.split('/')
            if len(urlsplit[-1].strip()) > 0:
                key = urlsplit[-1]
            else:
                key = urlsplit[-2]
            field = params.customized['field']
            if field == 'manhua':
                comments_url = self.MANHUA_COMMENTS_URL.format(key=key, pg=1)
                hxpath = XPathUtility(params.content)
                pubTime = hxpath.getstring(
                    '//*[@class="synopsises_font"]/li[2]/text()', ' ')
                if pubTime:
                    # BUG FIX: dropped a dead ``pubTime = pubTime[0]``
                    # that was immediately overwritten, and guarded the
                    # findall so a date-less page no longer raises
                    # IndexError.  The actual date is the first d/d/d
                    # token in the raw page content.
                    dates = re.findall('\d+/\d+/\d+', params.content)
                    if dates:
                        info = BaseInfoStorage.getbasicinfo(
                            params.originalurl)
                        info.pubtime = dates[0]
                        BaseInfoStorage.store(params.originalurl, info)
            elif field == 'book':
                comments_url = self.BOOK_COMMENTS_URL.format(key=key, pg=1)
            else:
                return
            self.storeurl(comments_url, params.originalurl, self.STEP_2, {
                'key': key,
                'field': field
            })
        elif params.step == BookComments.STEP_2:
            html = etree.HTML(params.content)
            comments_total_xpath = html.xpath(
                '//*[@class="content_title"]/span/a')
            if comments_total_xpath:
                # Count text may contain thousands separators.
                comments_total_str = self.r.parse(
                    u'(\d+)',
                    comments_total_xpath[0].text.replace(',', ''))
                if not comments_total_str:
                    return
                comments_total = int(comments_total_str[0])
                # Incremental check: skip when nothing new exists.
                cmtnum = URLStorage.getcmtnum(params.originalurl)
                if cmtnum >= comments_total:
                    return
                URLStorage.setcmtnum(params.originalurl, comments_total)
                # Page 1 is the response we are holding.
                self.geturlcomments(params)
                if comments_total > self.limit:
                    page_max = int(
                        math.ceil(float(comments_total) / self.limit))
                    # Queue every page beyond the first.
                    key = params.customized['key']
                    field = params.customized['field']
                    if field == 'manhua':
                        for page in range(2, page_max + 1, 1):
                            comments_url = self.MANHUA_COMMENTS_URL.format(
                                key=key, pg=page)
                            self.storeurl(comments_url, params.originalurl,
                                          self.STEP_3)
                    elif field == 'book':
                        for page in range(2, page_max + 1, 1):
                            comments_url = self.BOOK_COMMENTS_URL.format(
                                key=key, pg=page)
                            self.storeurl(comments_url, params.originalurl,
                                          self.STEP_3)
                    else:
                        return
        elif params.step == BookComments.STEP_3:
            # Extract the comments from a queued page.
            self.geturlcomments(params)
        else:
            pass
    except:
        Logger.printexception()