def step2bbs(self, params): Logger.getlogging().info("Dm5Commnets.STEP_2") # 将STEP_1中的docurl传下来 docurl = params.customized['docurl'] comments_count = self.r.parse(ur'(\d+)个回复', params.content)[0] # 判断增量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= comments_count: return URLStorage.setcmtnum(params.originalurl, comments_count) # 总数除以page_size,然后加1,可得到评论总页数comments_count pagenum = 0 xparser = XPathUtility(params.content) if not xparser.xpath('//*[@class="inkk ma5"]'): Logger.getlogging().warning('{0}:30001'.format(params.originalurl)) return pageList = xparser.xpath('//*[@id="search_fy"]/a/text()') if not pageList: pagenum = 1 else: pagenum = int(pageList[-2]) for page in range(1, pagenum + 1, 1): comment_url = Dm5Commnets.COMMENT_URL.format(docurl=docurl, page=page) self.storeurl(comment_url, params.originalurl, Dm5Commnets.STEP_3_BBS)
def s2query(self):
    self.conf.setchannel(SPIDER_CHANNEL_S2)
    s2file = SpiderConfigure.getinstance().gets2file()
    file = FileUtility.getfilename(s2file)
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
    if FileUtility.exists(s2temppath):
        with open(s2temppath, 'r') as fp:
            querylist = []
            firstline = True
            for strquery in fp.readlines():
                if firstline:
                    firstline = False
                    if strquery[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                        strquery = strquery[3:]
                strquery = Common.strip(strquery)
                if not strquery:
                    continue
                Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                self.conf.setquery(strquery)
                URLStorage.updaterecycle()
                querylist.append(strquery)
                for site in self.factory.getall():
                    site.s2query(strquery.replace('&', ' '))
            sitelist = []
            for site in self.factory.getall():
                if site.exists2():
                    sitelist.append(site)
            SpiderReport.loadquery(querylist)
            SpiderReport.loadsites(sitelist)

def preprocess(self, filepath):
    result = False
    context = URLStorage.getfilecontext(FileUtility.getfilename(filepath))
    if context:
        self.conf.setchannel(context.channel)
        if context.channel == SPIDER_CHANNEL_S2:
            self.conf.setquery(context.query)
        else:
            self.conf.setquery('')
        URLStorage.updaterecycle()
        result = True
    return result

def step2bbs(self, params): Logger.getlogging().info("Ea3wcomments.STEP_2") commentinfo_url = params.customized['commentinfo_url'] + "&load=all" xparser = XPathUtility(params.content) comments_count = xparser.getnumber('//div[@class="at-comment"]/a/span') # 保存页面评论量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= comments_count: return URLStorage.setcmtnum(params.originalurl, comments_count) self.storeurl(commentinfo_url, params.originalurl, Ea3wcomments.STEP_3_BBS)
def process(self, params):
    try:
        if params.step is None:
            # get the total comment count from the html content
            xhtml = XPathUtility(html=params.content)
            countsStr = str(xhtml.getstring('//*[@id="chartForm"]/div[1]/a[3]'))
            startpos = countsStr.find('(')
            if startpos < 0:
                Logger.getlogging().error(params.originalurl)
                return
            comment_counts = int(countsStr[startpos + 1:countsStr.find(')')])
            Logger.getlogging().debug(comment_counts)
            if comment_counts == 0:
                return
            # compare the comment count stored for this url on the last crawl with the current count
            # loop to build the comment urls and submit them to the download platform
            # (promote to float so the ceil division is not truncated in Python 2)
            for page in range(1, int(math.ceil(float(comment_counts) / Cine107Comments.PAGE_SIZE)) + 1, 1):
                commentUrl = Cine107Comments.COMMENTS_URL.format(url=params.originalurl, pageno=page)
                Logger.getlogging().debug(commentUrl)
                self.storeurl(commentUrl, params.originalurl, Cine107Comments.STEP_2)
            URLStorage.setcmtnum(params.originalurl, comment_counts)
        # parse the comment data
        elif params.step == Cine107Comments.STEP_2:
            xhtml = XPathUtility(html=params.content)
            comments = []
            contents = xhtml.getlist('//*[@class="flow_commont_list clearfix"]/p')
            updatetimes = xhtml.getlist('//*/time')
            for index in range(0, len(contents), 1):
                updatetime = TimeUtility.getuniformtime(updatetimes[index])
                if URLStorage.storeupdatetime(params.originalurl, updatetime):
                    cmti = CommentInfo()
                    Logger.getlogging().debug(contents[index])
                    cmti.content = str(contents[index])
                    comments.append(cmti)
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
    except:
        Logger.printexception()

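# A minimal sketch (not part of the crawler) of the page-count arithmetic used in
# process() above. In Python 2, dividing two ints truncates, so the comment count
# must be promoted to float before math.ceil; page_size here is a hypothetical
# stand-in for Cine107Comments.PAGE_SIZE.
import math

def calc_page_count(comment_counts, page_size):
    # e.g. 25 comments with page_size 10 -> ceil(2.5) -> 3 pages
    return int(math.ceil(float(comment_counts) / page_size))

# usage: calc_page_count(25, 10) == 3
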
def step2bbs(self, params): Logger.getlogging().info("BaozouNewsComments.STEP_2") topic_id = params.customized['topic_id'] commentsinfo = json.loads(params.content) comments_count = commentsinfo['total_count'] # 保存页面评论量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= comments_count: return URLStorage.setcmtnum(params.originalurl, comments_count) for index in range(1, int(commentsinfo['total_pages']) + 1, 1): commentinfo_url = BaozouNewsComments.COMMENT_URL.format( topic_id=topic_id, page=index) self.storeurl(commentinfo_url, params.originalurl, BaozouNewsComments.STEP_3_BBS)
def step2(self, params): """获取评论的url""" try: newsId = params.customized['newsId'] jsondata = json.loads(params.content) backflag = False if jsondata: comments = [] for comment in jsondata: cmti = CommentInfo() if URLStorage.storeupdatetime(params.originalurl, str(comment['commentTime'])): cmti.content = comment['commentContent'] cmti.commentid = comment['commentId'] comments.append(cmti) else: backflag = True self.commentstorage.store(params.originalurl, comments) if backflag == False: self.commentstorage.store(params.originalurl, comments) self.pageno += 1 comment_url = self.COMMENTS_URL.format( self.pageno, self.page_size, newsId) self.storeurl(comment_url, params.originalurl, self.STEP_COMMENT_FIRST_PAGE, {'newsId': newsId}) except: Logger.printexception()
def geturlcomments(self, params):
    # extract the comments themselves
    xparser = XPathUtility(params.content)
    comments_xpath = xparser.xpath('//*[@id="short_comment_content"]')
    if not comments_xpath:
        return
    # extract the publish times
    ip_pubtimes_xpath = xparser.getlist('//*[@id="short_comment_left"]')
    if len(comments_xpath) == len(ip_pubtimes_xpath):
        comments = []
        # collect the comments
        for index in range(0, len(comments_xpath), 1):
            cmti = CommentInfo()
            publicTime = ip_pubtimes_xpath[index]
            if self.r.search(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime):
                publicTime = '20' + self.r.parse(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime)[0]
            if self.r.search(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime):
                publicTime = self.r.parse(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime)[0]
            if URLStorage.storeupdatetime(params.originalurl, getuniformtime(publicTime)):
                # keep only the newly added comments (time-based comparison)
                cmti.content = comments_xpath[index].text
                comments.append(cmti)
        # store the newly collected comments
        if len(comments) > 0:
            self.commentstorage.store(params.originalurl, comments)

def step3(self, params):
    jsondata = json.loads(params.content)
    comments = []
    for comment in jsondata:
        cmti = CommentInfo()
        curcomtime = int(comment['created'])
        # check whether the latest crawled comment time needs updating;
        # the first comment's time is the newest comment time
        if URLStorage.storeupdatetime(params.originalurl, TimeUtility.getuniformdate2(curcomtime)):
            cmti.content = comment['contents']
            comments.append(cmti)
            # check whether this comment has replies
            if int(comment['comment_reply_total']) > 0:
                reply = comment['reply']
                # collect every reply to the comment
                for num in range(0, int(comment['comment_reply_total']), 1):
                    recmti = CommentInfo()
                    recmti.content = reply[num]['contents']
                    comments.append(recmti)
    if len(comments) > 0:
        # store the collected comments
        self.commentstorage.store(params.originalurl, comments)

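# Sketch of the incremental-crawl contract these step functions rely on (an assumption
# about the project API, not documented here): URLStorage.storeupdatetime(originalurl, tm)
# is treated as returning True only when tm is newer than the last comment time recorded
# for originalurl, so a False result means the remaining, older comments were already
# crawled on a previous run. The helper below reuses the project's URLStorage/CommentInfo.
def collect_new_comments(originalurl, parsed):
    # parsed: iterable of (time_string, content) pairs, newest first
    comments = []
    for tm, content in parsed:
        if not URLStorage.storeupdatetime(originalurl, tm):
            break  # everything from here on was stored by an earlier crawl
        cmti = CommentInfo()
        cmti.content = content
        comments.append(cmti)
    return comments
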
def copyfiles(self):
    # s1/s2 input paths
    s1file = SpiderConfigure.getinstance().gets1file()
    s2file = SpiderConfigure.getinstance().gets2file()
    # s1/s2 history paths
    self.conf.setchannel(SPIDER_CHANNEL_S1)
    s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
    if FileUtility.exists(s1file):
        lines = 0
        firstline = True
        with open(s1file, 'r') as fp:
            for line in fp.readlines():
                line = line.strip()
                if firstline:
                    firstline = False
                    if line[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                        line = line[3:]
                if line:
                    lines += 1
                    SpiderReport.puts1url(line)
        if lines > 0:
            FileUtility.copy(s1file, s1tempfile)
            SpiderReport.update(SPIDER_CHANNEL_S1, '', SpiderReport.URL_UPLOAD, lines)
    if FileUtility.exists(s2file):
        FileUtility.copy(s2file, s2temppath)

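# Hypothetical helper sketching the UTF-8 BOM handling that s2query() and copyfiles()
# above both repeat on the first line of an uploaded file; codecs.BOM_UTF8 is the
# three-byte marker '\xef\xbb\xbf' that some editors prepend.
import codecs

def strip_utf8_bom(line):
    # drop the marker so the query/url text can be used directly
    if line[:3] == codecs.BOM_UTF8:
        return line[3:]
    return line

# usage: strip_utf8_bom(codecs.BOM_UTF8 + 'http://example.com') == 'http://example.com'
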
def step2(self, params): """""" print params.content try: jsondata = json.loads(params.content) comments_total = int(jsondata['comments_total']) comments_data = jsondata['comments'] except: Logger.getlogging().warning( '{url}:30000 No comments'.format(url=params.originalurl)) return #cmtnum = URLStorage.getcmtnum(params.originalurl) #if cmtnum >= comments_total: #return #URLStorage.setcmtnum(params.originalurl, comments_total) comments = [] for comment in comments_data: cmti = CommentInfo() cmti.content = comment['txtcontent'] tm = comment['addtime'] if URLStorage.storeupdatetime(params.originalurl, tm): comments.append(cmti) if len(comments) > 0: # 保存获取的评论 self.commentstorage.store(params.originalurl, comments) self.post_data['p'] = str(int(self.data['p'] + self.page_size)) self.post_data['t'] = TimeUtility.getuniformdate(tm, '%Y-%m-%d+%H%M%S') self.storeposturl(self.post_url, params.originalurl, self.STEP_2, self.post_data)
def step3(self, params): Logger.getlogging().info("Flash8Comments.STEP_3") # Step3: 通过Step2设置的url,得到所有评论,抽取评论 page = params.customized['page'] xparser = XPathUtility(params.content) commentsinfo = xparser.getcomments('//td[@class="t_f"]') #commentstime = self.r.parse(ur'发表于 (\d+-\d+-\d+ \d+:\d+)</em>', params.content) commentstime = xparser.getcomments('//div[@class="authi"]/em') comments = [] # 获取评论 # 设置实际的评论量 if page is 1: statrIndex = 1 else: statrIndex = 0 for index in range(statrIndex, len(commentstime), 1): cmti = CommentInfo() if URLStorage.storeupdatetime(params.originalurl, commentstime[index]): # 获取增加的评论(根据时间比较) cmti.content = commentsinfo[index] comments.append(cmti) # 保存获取到的评论 if len(comments) > 0: self.commentstorage.store(params.originalurl, comments)
def process(self, params):
    try:
        if params.step is AllComments.STEP_1:
            key = int(re.findall("\d+", params.url.split("/")[-1])[0])
            comments_url = AllComments.COMMENTS_URL % (key)
            self.storeurl(comments_url, params.originalurl, AllComments.STEP_2, {'key': key})
        elif params.step is AllComments.STEP_2:
            jsoncontent = self.r.parse('data\((.*?)\)', params.content)[0]
            comments = json.loads(jsoncontent)
            pcontent = []
            ptime = []
            index = 0
            for index in range(0, len(comments['comments'])):
                pcontent.append(comments['comments'][index]['comment_content'])
                ptime.append(comments['comments'][index]['comment_date'])
            dataresult = {}
            for i in range(len(pcontent)):
                dataresult[ptime[i]] = pcontent[i]
            comments = []
            dataresult = sorted(dataresult.iteritems(), key=lambda dataresult: dataresult[0], reverse=True)
            for k in range(0, len(dataresult)):
                if URLStorage.storeupdatetime(params.originalurl, dataresult[k][0]):
                    cmti = CommentInfo()
                    cmti.content = dataresult[k][1]
                    comments.append(cmti)
            self.commentstorage.store(params.originalurl, comments)
    except Exception, e:
        traceback.print_exc()
        Logger.getlogging().error(e.message)

def process(self, params):
    try:
        if params.step is AllComments.STEP_1:
            try:
                threadid = self.r.parse('data-thread-key=\"(.*?)\"', params.content)[0]
                comments_url = AllComments.COMMENTS_URL % (threadid, 1)
                self.storeurl(comments_url, params.originalurl, AllComments.STEP_2,
                              {'threadid': threadid, 'pageno': 1})
            except:
                return
        elif params.step is AllComments.STEP_2:
            try:
                comments = json.loads(params.content)
                pagetotal = int(comments['cursor']['pages'])
                comments_url = AllComments.COMMENTS_URL % (params.customized['threadid'], params.customized['pageno'])
                self.storeurl(comments_url, params.originalurl, AllComments.STEP_3,
                              {'threadid': params.customized['threadid'],
                               'pageno': params.customized['pageno'],
                               'totalpage': pagetotal})
            except:
                return
        elif params.step is AllComments.STEP_3:
            try:
                if params.customized['pageno'] <= params.customized['totalpage']:
                    comments = json.loads(params.content)
                    roll = len(comments['response'])
                    ptimer = []
                    pcontent = []
                    for key in comments['parentPosts'].keys():
                        ptime = comments['parentPosts'][key]['created_at']
                        ptime = ptime.split("+")[0]
                        ptime = ptime.replace("T", " ")
                        ptimer.append(datetime.datetime.strptime(ptime, '%Y-%m-%d %H:%M:%S'))
                        pcontent.append(comments['parentPosts'][key]['message'])
                    for ctime in range(0, len(ptimer)):
                        ptimer[ctime] = datetime.datetime.strptime(str(ptimer[ctime]), '%Y-%m-%d %H:%M:%S')
                    index = 0
                    comments = []
                    complete = False
                    for comment in pcontent:
                        cmti = CommentInfo()
                        cmti.content = comment
                        if URLStorage.storeupdatetime(params.originalurl, str(ptimer[index])):
                            comments.append(cmti)
                        else:
                            complete = True
                            break
                        index = index + 1
                    self.commentstorage.store(params.originalurl, comments)
                    if not complete:
                        comments_url = AllComments.COMMENTS_URL % (params.customized['threadid'],
                                                                   params.customized['pageno'] + 1)
                        self.storeurl(comments_url, params.originalurl, AllComments.STEP_2,
                                      {'threadid': params.customized['threadid'],
                                       'pageno': params.customized['pageno'] + 1,
                                       'totalpage': params.customized['totalpage']})
            except:
                return
    except Exception, e:
        traceback.print_exc()
        Logger.getlogging().error(e.message)

def step2_2(self, params): """""" try: jsondata = json.loads(params.content) data = jsondata['data'] soup = BeautifulSoup(data, 'html5lib') divs = soup.select('.comment') except: Logger.getlogging().warning( '{url}:30000 No comments'.format(url=params.originalurl)) return #comments_total = len(divs) #cmtnum = URLStorage.getcmtnum(params.originalurl) #if cmtnum >= comments_total: #return #URLStorage.setcmtnum(params.originalurl, comments_total) comments = [] #divs.reverse() for div in divs: cmti = CommentInfo() cmti.content = div.find(attrs={ 'style': re.compile('padding-top') }).get_text().strip() tm = div.select_one('.show-time').get_text() tm = getuniformtime(tm) if not tm: continue if URLStorage.storeupdatetime(params.originalurl, tm): comments.append(cmti) if len(comments) > 0: # 保存获取的评论 self.commentstorage.store(params.originalurl, comments)
def step3bbs(self, params): Logger.getlogging().info("Tmtpostcommnets.STEP_3") # Step3: 通过Step2设置的url,得到所有评论,抽取评论 commentsinfo = json.loads(params.content) comments = [] #for index in range(0, int(len(commentsinfo['data'])), 1): ## 提取时间 #cmti = CommentInfo() #cmti.content = commentsinfo['data'][index]['comment'] #tm = TimeUtility.getuniformtime(commentsinfo['data'][index]['time_created'], u'%Y-%m-%d %H:%M') #if URLStorage.storeupdatetime(params.originalurl, tm): #comments.append(cmti) jsondata = commentsinfo['data'] if not jsondata: return for data in jsondata: cmti = CommentInfo() cmti.content = data['comment'] tm = gettimeutil.getuniformtime(data['time_created']) if URLStorage.storeupdatetime(params.originalurl, tm): comments.append(cmti) # 保存获取的评论 if len(comments) > 0: self.commentstorage.store(params.originalurl, comments)
def process(self, params):
    try:
        if params.step is AllComments.STEP_1:
            aid = re.findall("\d+", params.url.split("/")[-1])[0]
            aid_url = AllComments.AID_URL % (aid)
            self.storeurl(aid_url, params.originalurl, AllComments.STEP_2, {'aid': aid})
        elif params.step is AllComments.STEP_2:
            cms_id = re.findall('appidArr \= \[\"cms\|(.+?)",', str(params.content))[0]
            cms_url = AllComments.KEYID_URL % (cms_id, params.customized['aid'], params.originalurl)
            self.storeurl(cms_url, params.originalurl, AllComments.STEP_3,
                          {'aid': params.customized['aid'], 'cmsid': cms_id})
        elif params.step is AllComments.STEP_3:
            comments = json.loads(params.content)
            sid = comments['data']['_id']
            comment_url = AllComments.COMMENTS_URL % (sid, '1', params.customized['cmsid'])
            self.storeurl(comment_url, params.originalurl, AllComments.STEP_4,
                          {'sid': sid, 'page': '1', 'cmsid': params.customized['cmsid']})
        elif params.step is AllComments.STEP_4:
            comments = json.loads(params.content)
            try:
                comment = []
                index = 0
                for index in range(0, len(comments['data'])):
                    ctime = TimeUtility.getuniformtime2(comments['data'][index]['ctime'])
                    if URLStorage.storeupdatetime(params.originalurl, str(ctime)):
                        cmti = CommentInfo()
                        cmti.content = comments['data'][index]['content']
                        comment.append(cmti)
                self.commentstorage.store(params.originalurl, comment)
                comment_url = AllComments.COMMENTS_URL % (params.customized['sid'],
                                                          str(int(params.customized['page']) + 1),
                                                          params.customized['cmsid'])
                self.storeurl(comment_url, params.originalurl, AllComments.STEP_4,
                              {'sid': params.customized['sid'],
                               'page': str(int(params.customized['page']) + 1),
                               'cmsid': params.customized['cmsid']})
            except:
                return
    except Exception, e:
        traceback.print_exc()
        Logger.getlogging().error(e.message)

def processfile(self, jsonfile):
    if not self.preprocess(jsonfile):
        return
    post = (constant.POST_FILE_SUFFIX in jsonfile)
    urls = self.backupfile(jsonfile)
    context = URLStorage.getfilecontext(FileUtility.getfilename(jsonfile))
    with open(jsonfile, 'r') as fp:
        lines = fp.readlines()
        for line in lines:
            param = self.analysis(line, post)
            if param is None:
                continue
            url = param.url
            if context.retry >= 2:
                param.lastretry = True
            if post:
                url = json.dumps({'url': param.url, 'data': param.data})
            else:
                Logger.getlogging().warning(url)
            info = None
            if URLStorage.hasurl(url):
                info = URLStorage.geturlcontext(url)
                param.originalurl = info.originalurl
                param.step = info.step
                param.customized = info.customized
            else:
                param.originalurl = param.url
            res = True
            if SiteS2Query.REFER_URL in param.customized:
                site = self.factory.getsite(param.customized[SiteS2Query.REFER_URL])
                res = site.process(param)
            else:
                site = self.factory.getsite(param.originalurl)
                res = site.process(param)
            if not res:
                if info:
                    URLStorage.seturlcontext(param.url, info)
            else:
                if url in urls:
                    urls[url] -= 1
                    if urls[url] == 0:
                        urls.pop(url)
    # upload failed urls
    if urls:
        self.retrydownload(jsonfile, urls)

def step2(self, params): Logger.getlogging().info("ThirtysixKryptonComments.STEP_2") # 将STEP_1中的cid传下来 cid = params.customized['cid'] jsoncontent = json.loads(params.content) comments_count = jsoncontent['data']['total_items'] page_count = jsoncontent['data']['total_pages'] # 判断增量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= comments_count: return URLStorage.setcmtnum(params.originalurl, comments_count) for page in range(1, page_count+1, 1): commentinfo_url = ThirtysixKryptonComments.COMMENT_URL.format(cid, self.page_size, page) self.storeurl(commentinfo_url, params.originalurl, ThirtysixKryptonComments.STEP_3)
def process(self, params):
    try:
        if params.step is ChinabyteComments.STEP_1:
            threadid = self.r.parse('data-thread-key=\"(.*?)\"', params.content)
            if not threadid:
                return
            comments_url = ChinabyteComments.COMMENTS_URL % (threadid[0], 1)
            self.storeurl(comments_url, params.originalurl, ChinabyteComments.STEP_2,
                          {'threadid': threadid[0], 'pageno': 1})
        elif params.step == ChinabyteComments.STEP_2:
            try:
                threadid = params.customized['threadid']
                comments = json.loads(params.content)
                pagetotal = int(comments['cursor']['pages'])
            except:
                Logger.getlogging().warning('{0}:30000'.format(params.originalurl))
                return
            #threadid = params.customized['threadid']
            #comments = json.loads(params.content)
            #pagetotal = int(comments['cursor']['pages'])
            # no comments when pages == 0
            if pagetotal == 0:
                return
            for page in range(1, pagetotal + 1, 1):
                comments_url = ChinabyteComments.COMMENTS_URL % (threadid, page)
                self.storeurl(comments_url, params.originalurl, ChinabyteComments.STEP_3)
            # comments_url = ChinabyteComments.COMMENTS_URL % (params.customized['threadid'], params.customized['pageno'])
            # self.storeurl(comments_url, params.originalurl, ChinabyteComments.STEP_3,
            #               {'threadid': params.customized['threadid'],
            #                'pageno': params.customized['pageno'],
            #                'totalpage': pagetotal})
        elif params.step == ChinabyteComments.STEP_3:
            comments = []
            commentinfo = json.loads(params.content)
            for key in commentinfo['parentPosts'].keys():
                updatetime = getuniformtime(commentinfo['parentPosts'][key]['created_at'])
                if URLStorage.storeupdatetime(params.originalurl, updatetime):
                    cmti = CommentInfo()
                    cmti.content = commentinfo['parentPosts'][key]['message']
                    comments.append(cmti)
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
    except:
        Logger.printexception()

def step2bbs(self, params): Logger.getlogging().info("Tmtpostcommnets.STEP_2") tid = params.customized['tid'] commentsinfo = json.loads(params.content) comments_count = commentsinfo['cursor']['total'] # 保存页面评论量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= comments_count: return URLStorage.setcmtnum(params.originalurl, comments_count) for index in range(0, int(math.ceil(float(comments_count) / self.limit)), 1): self.offset = index * self.limit commentinfo_url = Tmtpostcommnets.COMMENT_URL.format( tid=tid, limit=self.limit, offset=self.offset) self.storeurl(commentinfo_url, params.originalurl, Tmtpostcommnets.STEP_3_BBS)
def process(self, proparam):
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step is BaozouVideoComments.STEP_1:
            # Step1: derive the article id from the original url and build the first comment-page url
            Logger.getlogging().info("proparam.step is None")
            article_id = int(self.r.parse(r'^http://baozou\.com/\w+/(\d+).*', proparam.url)[0])
            Logger.getlogging().debug(article_id)
            commentinfo_url = BaozouVideoComments.COMMENTS_URL % (article_id, 1, self.per_page)
            self.storeurl(commentinfo_url, proparam.originalurl, BaozouVideoComments.STEP_2,
                          {'article_id': article_id})
        elif proparam.step == BaozouVideoComments.STEP_2:
            # Step2: from the url set in Step1, get the total comment count and the latest comment time,
            # then build the urls of the remaining comment pages from the total
            Logger.getlogging().info("proparam.step == 2")
            article_id = proparam.customized['article_id']
            commentsinfo = json.loads(proparam.content)
            #print commentsinfo
            comments_count = int(commentsinfo['total_entries'])
            #print comments_count
            Logger.getlogging().debug('{url} comment: {ct}'.format(url=proparam.url, ct=comments_count))
            #page = commentsinfo['total_pages']
            #print page
            if comments_count == 0:
                return
            # build and store the comment-page urls
            for page in range(1, int(math.ceil(float(comments_count) / self.per_page)) + 1, 1):
                comment_url = BaozouVideoComments.COMMENTS_URL % (article_id, page, self.per_page)
                self.storeurl(comment_url, proparam.originalurl, BaozouVideoComments.STEP_3)
        elif proparam.step == BaozouVideoComments.STEP_3:
            # Step3: fetch all comments from the urls set in Step2 and extract them
            Logger.getlogging().info("proparam.step == 3")
            commentsinfo = json.loads(proparam.content)
            contents = commentsinfo['comments']
            commentsarr = []
            for content in contents:
                cmti = CommentInfo()
                tm = TimeUtility.getuniformtime(content['created_at'], '%Y-%m-%d %H:%M:%S')
                if URLStorage.storeupdatetime(proparam.originalurl, tm):
                    cmti.content = content['content']
                    commentsarr.append(cmti)
            # store the collected comments
            if len(commentsarr) > 0:
                self.commentstorage.store(proparam.originalurl, commentsarr)
        else:
            Logger.getlogging().error('proparam.step == {step}'.format(step=proparam.step))
    except Exception, e:
        traceback.print_exc()

def process(self, params):
    Logger.getlogging().info(params.url)
    try:
        if params.step is Dm123BbsComments.STEP_1:
            xparser = XPathUtility(params.content)
            # use the url passed in the first time to decide whether there are more pages
            keyvalue = self.r.parse('tid-(.*?).html', params.url)[0]
            pagecount = xparser.getnumber('//*[@class="pages"]/div[@class="fl"]')
            commentinfo_url = params.url
            self.storeurl(commentinfo_url, params.originalurl, Dm123BbsComments.STEP_2,
                          {'keyvalue': keyvalue, 'totalpage': pagecount, 'curpage': 1})
        elif params.step == Dm123BbsComments.STEP_2:
            keyvalue = params.customized['keyvalue']
            curpage = params.customized['curpage']
            xparser = XPathUtility(params.content)
            commentsinfo = xparser.getcomments('//div[contains(@class,"tpc_content")]')
            commentstime = self.r.parse(ur'\"(\d+-\d+-\d+ \d+:\d+)\">发表于:', params.content)
            comments = []
            for index in range(0, len(commentstime)):
                cmti = CommentInfo()
                if URLStorage.storeupdatetime(params.originalurl,
                                              TimeUtility.getuniformtime(commentstime[index] + ':00')):
                    # keep only the newly added comments (time-based comparison)
                    cmti.content = commentsinfo[index]
                    comments.append(cmti)
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
            nextpageList = [keyvalue, "-page-", str(curpage + 1)]
            nextpage = ''
            nextpage = nextpage.join(nextpageList)
            if int(nextpageList[2]) <= int(params.customized['totalpage']):
                comment_url = Dm123BbsComments.COMMENT_URL.format(page=nextpage)
                self.storeurl(comment_url, params.originalurl, Dm123BbsComments.STEP_2,
                              {'keyvalue': nextpageList[0],
                               'totalpage': params.customized['totalpage'],
                               'curpage': curpage + 1})
    except Exception, e:
        traceback.print_exc()

def step2(self, params): Logger.getlogging().info("Flash8Comments.STEP_2") # 将STEP_1中的docurl传下来 docurl = params.customized['docurl'] xparser = XPathUtility(params.content) commentsinfo = xparser.getstring('//div[@class="page"]/span/font[1]') # 保存页面评论量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= int(commentsinfo[0]): return URLStorage.setcmtnum(params.originalurl, int(commentsinfo[0])) # 总数除以page_size,然后加1,可得到评论总页数comments_count pagecount = xparser.getnumber('//*[@class="pg"]/label/span') if pagecount == 0: pagecount = pagecount + 1 for page in range(1, pagecount + 1, 1): comment_url = Flash8Comments.COMMENT_URL.format(docurl=docurl, page=page) self.storeurl(comment_url, params.originalurl, Flash8Comments.STEP_3, {'page': page})
def step2(self, params): """""" uniqid = params.customized['uniqid'] domain = params.customized['domain'] url = params.customized['url'] jsondata = json.loads(params.content) comments_count = int(jsondata['show']['total_num']) # 检查评论数是否增加,没有增加,返回;有增加,更新增加后的页面评论量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= int(comments_count): return URLStorage.setcmtnum(params.originalurl, int(comments_count)) #3. 拼出获取所有评论的url max = int( math.ceil(float(comments_count) / HuyaComments.DEFAULT_PAGE_SIZE)) for page in range(1, max + 1, 1): #num = (page - 1)*HuyaComments.DEFAULT_PAGE_SIZE comments_url = HuyaComments.COMMENTS_URL.format(uniqid=uniqid, domain=domain, url=url, page=page) self.storeurl(comments_url, params.originalurl, HuyaComments.STEP_3)
def step3bbs(self, params): Logger.getlogging().info("Chinavaluecomments.STEP_3") # Step3: 通过Step2设置的url,得到所有评论,抽取评论 params.content = params.content[1:len(params.content) - 1] commentsinfo = json.loads(params.content) comments_count = commentsinfo['RecordCount'] # 判断增量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= comments_count: return URLStorage.setcmtnum(params.originalurl, comments_count) comments = [] for index in range(0,len(commentsinfo['CommentObjs'])): # 提取时间 cmti = CommentInfo() cmti.content = commentsinfo['CommentObjs'][index]['Content'] tm = TimeUtility.getuniformtime(TimeUtility.getuniformtime(commentsinfo['CommentObjs'][index]['AddTime'], u'%Y-%m-%d %H:%M')) if URLStorage.storeupdatetime(params.originalurl, tm): comments.append(cmti) # 保存获取的评论 if len(comments) > 0: self.commentstorage.store(params.originalurl, comments)
def retrydownload(self, jsonfile, urlset):
    Logger.getlogging().warning('upload failed urls {num}'.format(num=len(urlset)))
    context = URLStorage.getfilecontext(FileUtility.getfilename(jsonfile))
    if context.retry >= 2:
        Logger.getlogging().error('do not upload for failed again')
        for key in urlset.keys():
            Logger.getlogging().error('download {url} failed'.format(url=key))
    else:
        urls = []
        for key in urlset.keys():
            Logger.getlogging().warning('retry download {url}'.format(url=key))
            for i in range(0, urlset[key]):
                urls.append(key)
        StatisticsManager.updateall(-len(urls))
        URLStorage.updaterecycle(context.retry + 1)
        if constant.POST_FILE_SUFFIX in jsonfile:
            URLStorage.storeurls(urls, constant.REQUEST_TYPE_POST)
        elif constant.WEBKIT_FILE_SUFFIX in jsonfile:
            URLStorage.storeurls(urls, constant.REQUEST_TYPE_WEBKIT)
        else:
            URLStorage.storeurls(urls, constant.REQUEST_TYPE_COMMON)

def step3bbs(self, params): Logger.getlogging().info("BaozouNewsComments.STEP_3") # Step3: 通过Step2设置的url,得到所有评论,抽取评论 commentsinfo = json.loads(params.content) comments = [] for index in range(0, int(len(commentsinfo['comments'])), 1): # 提取时间 cmti = CommentInfo() cmti.content = commentsinfo['comments'][index]['content'] tm = commentsinfo['comments'][index]['created_at'] if URLStorage.storeupdatetime(params.originalurl, tm): comments.append(cmti) # 保存获取的评论 if len(comments) > 0: self.commentstorage.store(params.originalurl, comments)
def step3(self, params): Logger.getlogging().info("ThirtysixKryptonComments.STEP_3") # Step3: 通过Step2设置的url,得到所有评论,抽取评论 jsoncontent = json.loads(params.content) commentsInfo = [] for index in range(0, len(jsoncontent['data']['items']), 1): cmti = CommentInfo() # 提取评论内容 cmti.content = jsoncontent['data']['items'][index]['content'] # 提取时间 publicTime = jsoncontent['data']['items'][index]['created_at'] tm = TimeUtility.getuniformtime(TimeUtility.getuniformtime(publicTime, u'%Y-%m-%d %H:%M:%S')) if URLStorage.storeupdatetime(params.originalurl, tm): commentsInfo.append(cmti) if len(commentsInfo) > 0: # 保存获取的评论 self.commentstorage.store(params.originalurl, commentsInfo)
def step5bbs(self, params):
    # fetch all comments from the urls set in the previous step and extract them
    soup = BeautifulSoup(params.content, 'html.parser')
    commentsinfo = soup.select('.cpl_nrr2')
    commentstime = soup.select('.cpl_nrr1')
    comments = []
    # collect the comments
    for index in range(0, int(len(commentsinfo) - 1), 1):
        # extract the time
        cmti = CommentInfo()
        cmti.content = commentsinfo[index].get_text()
        publicTime = self.r.parse(ur'发表于 (.*)', commentstime[index].get_text().strip())[0]
        publicTime = getuniformtime(publicTime)
        if URLStorage.storeupdatetime(params.originalurl, publicTime):
            comments.append(cmti)
    # store the collected comments
    if len(comments) > 0:
        self.commentstorage.store(params.originalurl, comments)