def getcomments_step3(self, params):
    xhtml = XPathUtility(html=params.content)
    contents = xhtml.getlist('//*[contains(@id,"partThreadContent")]')
    curtimes = xhtml.getlist('//*[@class="comment_rw"]/span/em')
    nicks = xhtml.getlist('//*[@class="wzbox"]/h5')
    for index in range(len(contents)):
        curtime = TimeUtility.getuniformtime(curtimes[index] + ':00')
        content = str(contents[index])
        nick = str(nicks[index])
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def getcomments_step3(self, params):
    xhtml = XPathUtility(html=params.content)
    contents = xhtml.getlist('//*[@class="wz"]/p')
    curtimes = xhtml.getlist('//*[@class="fr"]')
    nicks = xhtml.getlist('//*[@class="wzbox"]/h5')
    for index in range(len(contents)):
        # Drop the leading four-character label from the time text and append seconds.
        curtime = curtimes[index][4:] + ':00'
        Logger.getlogging().debug(contents[index])
        content = str(contents[index])
        nick = str(nicks[index])
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def getComments(self, params):
    xhtml = XPathUtility(html=params.content)
    commentinfo = xhtml.getlist('//*[@class="recTxt"]')
    updatetimes = xhtml.getlist('//*[@class="comment-time"]')
    for index in range(len(commentinfo)):
        # The time text is wrapped in a pair of brackets; strip them before normalizing.
        curtime = TimeUtility.getuniformtime(updatetimes[index][1:-1])
        content = commentinfo[index]
        nick = 'nick'
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def process(self, params):
    try:
        if params.step is None:
            # Get the total comment count from the html content.
            xhtml = XPathUtility(html=params.content)
            countsStr = str(xhtml.getstring('//*[@id="chartForm"]/div[1]/a[3]'))
            startpos = countsStr.find('(')
            if startpos < 0:
                Logger.getlogging().error(params.originalurl)
                return
            comment_counts = int(countsStr[startpos + 1:countsStr.find(')')])
            Logger.getlogging().debug(comment_counts)
            if comment_counts == 0:
                return
            # Compare the comment count from the last crawl of this url with the current count.
            # Build the comment page urls and submit them to the download platform.
            page_num = int(math.ceil(float(comment_counts) / Cine107Comments.PAGE_SIZE))
            for page in range(1, page_num + 1):
                commentUrl = Cine107Comments.COMMENTS_URL.format(url=params.originalurl, pageno=page)
                Logger.getlogging().debug(commentUrl)
                self.storeurl(commentUrl, params.originalurl, Cine107Comments.STEP_2)
            URLStorage.setcmtnum(params.originalurl, comment_counts)
        # Parse the comment data.
        elif params.step == Cine107Comments.STEP_2:
            xhtml = XPathUtility(html=params.content)
            comments = []
            contents = xhtml.getlist('//*[@class="flow_commont_list clearfix"]/p')
            updatetimes = xhtml.getlist('//*/time')
            for index in range(len(contents)):
                updatetime = TimeUtility.getuniformtime(updatetimes[index])
                if URLStorage.storeupdatetime(params.originalurl, updatetime):
                    cmti = CommentInfo()
                    Logger.getlogging().debug(contents[index])
                    cmti.content = str(contents[index])
                    comments.append(cmti)
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
    except:
        Logger.printexception()
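# A minimal standalone sketch (not part of the source) of the page-count arithmetic used
# above: under Python 2, dividing two ints truncates, so float() is needed before
# math.ceil or the final partial page would be dropped. The PAGE_SIZE value is illustrative.
import math

def example_page_count(comment_counts, page_size=20):
    return int(math.ceil(float(comment_counts) / page_size))

# example_page_count(45) -> 3; plain integer division would give 2 and lose the last page.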
def getpagecomments(self, params):
    info = params.customized['query']
    xpath = XPathUtility(html=params.content)
    hrefs = xpath.xpath('//*[@class="sosResult"]/strong/a/@href')
    titles = xpath.getlist('//*[@class="sosResult"]/strong/a')
    pubtimes = xpath.xpath('//*[@class="sosResult"]/span/cite[3]')
    today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                       TimeUtility.DATE_FORMAT_DEFAULT).date()
    urllist = []
    for index in range(len(titles)):
        # The title must contain the query keyword.
        if Common.checktitle(info, titles[index]):
            pubtimestr = TimeUtility.getuniformtime(pubtimes[index].text).split(' ')[0]
            pubtime = datetime.datetime.strptime(pubtimestr,
                                                 TimeUtility.DATE_FORMAT_DEFAULT).date()
            interval = today - pubtime
            # Keep the url only if it was published within the query period.
            if interval.days <= int(self.querylastdays):
                newurl = self.preprocess(hrefs[index])
                if newurl is not None:
                    urllist.append(newurl)
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def bbs_step3(self, params):
    try:
        xparser = XPathUtility(params.content)
        page = params.customized['page']
        pagecount = params.customized['pagecount']
        updatetimes = []
        nicks = []
        contents = xparser.getcomments('//*[@class="read"]')
        mid_times = xparser.getlist('//td[@class="authorname"]')
        for times in mid_times:
            updatetimes.append(self.r.parse(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', times)[0])
            nicks.append(self.r.parse(ur'(.*)于', times)[0])
        # Skip the first entry on the first page (presumably the post body, not a comment).
        if page == 0:
            mid_index = 1
        else:
            mid_index = 0
        comments_number = xparser.getnumber('//*[@id="msgsubject"]/font')
        if comments_number != 0:
            for index in range(mid_index, len(contents)):
                curtime = TimeUtility.getuniformtime(updatetimes[index])
                content = contents[index]
                nick = nicks[index].split('于')[0].split('☆')[-1]
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except Exception as e:
        traceback.print_exc()
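# A hedged stdlib sketch of what the self.r.parse calls above are assumed to do: pull the
# timestamp and the nickname out of an author cell shaped like u'某某于YYYY-MM-DD HH:MM:SS留言'.
# The sample string is illustrative only, not taken from the source.
import re

sample = u'张三于2016-12-15 10:30:00留言'
updatetime = re.search(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', sample).group(1)   # u'2016-12-15 10:30:00'
nick = re.search(ur'(.*)于', sample).group(1)                                  # u'张三'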
def step2(self, params):
    Logger.getlogging().info("LaohuComments.STEP_2")
    token = params.customized['token']
    sourceId = params.customized['sourceId']
    xhtml = XPathUtility(html=params.content)
    # Header text looks like: 网友评论(32)
    countstr = xhtml.getlist('//*[@class="filter-by-type"]')[0]
    comment_counts = int(countstr[5:len(countstr) - 1])
    if comment_counts:
        NewsStorage.setcmtnum(params.originalurl, comment_counts)
    if comment_counts == 0:
        Logger.getlogging().warning('{url}:30000 No comments'.format(url=params.originalurl))
        return
    # Incremental check: stop if we already have at least this many comments.
    cmtnum = CMTStorage.getcount(params.originalurl, True)
    if cmtnum >= comment_counts:
        return
    page_num = int(math.ceil(float(comment_counts - cmtnum) / self.PAGE_SIZE))
    if page_num >= self.maxpages:
        page_num = self.maxpages
    # Fetch the first page of comments.
    self.getComments(params)
    if comment_counts > 15:
        # Build the remaining comment page urls and submit them to the download platform.
        # COMMENTS_URL template: http://member.laohu.com/comment/ajax?page=%d&token=%s&order=new
        for page in range(2, page_num + 1):
            commentUrl = LaohuComments.COMMENTS_URL % (page, sourceId)
            self.storeurl(commentUrl, params.originalurl, LaohuComments.STEP_3,
                          {'token': token, 'sourceId': sourceId})
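# A hedged sketch (stdlib only, sample text illustrative) of extracting the count from a
# header such as u'网友评论(32)'; a regex is somewhat more robust than the fixed slicing above.
import re

countstr = u'网友评论(32)'
match = re.search(ur'\((\d+)\)', countstr)
comment_counts = int(match.group(1)) if match else 0   # -> 32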
def geturlcomments(self, params):
    # Extract the individual comments.
    xparser = XPathUtility(params.content)
    comments_xpath = xparser.xpath('//*[@id="short_comment_content"]')
    if not comments_xpath:
        return
    # Extract the publish times.
    ip_pubtimes_xpath = xparser.getlist('//*[@id="short_comment_left"]')
    if len(comments_xpath) == len(ip_pubtimes_xpath):
        comments = []
        for index in range(len(comments_xpath)):
            cmti = CommentInfo()
            publicTime = ip_pubtimes_xpath[index]
            # Two-digit-year form, e.g. 16-12-15 10:30: prefix the century.
            if self.r.search(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime):
                publicTime = '20' + self.r.parse(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime)[0]
            # Slash form, e.g. 2016/12/15 10:30:45.
            if self.r.search(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime):
                publicTime = self.r.parse(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime)[0]
            # Keep only comments newer than the last recorded update time.
            if URLStorage.storeupdatetime(params.originalurl, getuniformtime(publicTime)):
                cmti.content = comments_xpath[index].text
                comments.append(cmti)
        # Store the collected comments (mirrors the sibling extractors).
        if len(comments) > 0:
            self.commentstorage.store(params.originalurl, comments)
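# A hedged stdlib sketch of the two timestamp shapes handled above: a two-digit-year form
# such as '16-12-15 10:30' gets a '20' century prefix, while a slash form such as
# '2016/12/15 10:30:45' is taken as-is. Sample strings are illustrative only.
import re

def example_normalize(publicTime):
    m = re.search(r'\d{2}-\d+-\d+ \d+:\d+', publicTime)
    if m:
        return '20' + m.group(0)
    m = re.search(r'\d+/\d+/\d+ \d+:\d+:\d+', publicTime)
    if m:
        return m.group(0)
    return publicTime

# example_normalize('16-12-15 10:30')      -> '2016-12-15 10:30'
# example_normalize('2016/12/15 10:30:45') -> '2016/12/15 10:30:45'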
def process(self, params):
    # Build the search page urls from the result count on the first search page.
    if params.step == LaohuS2Query.LAOHU_S2QUERY_FIRST_PAGE:
        # Parameters carried over from the first page url.
        KEY = params.customized['KEY']
        time = params.customized['time']
        # Total number of result pages.
        xparser = XPathUtility(params.content)
        pageCounts = xparser.getlist('//*[@id="main"]/div[2]/span')
        if len(pageCounts) > 0:
            page = str(pageCounts[0]).split('/')[1]
            # Process the first page of search results.
            self.pageprocess(params)
            if int(page) > 1:
                if int(page) >= self.maxpages:
                    page = self.maxpages
                querylist = []
                # The first page is already handled, so build query urls from page 2 onward.
                for pages in range(2, int(page) + 1):
                    url = LaohuS2Query.LAOHU_QUERY_TEMPLATE.format(KEY=KEY, pn=pages, time=time)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist, LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE, {'KEY': KEY})
        else:
            Logger.getlogging().debug('Sorry, no posts found for ' + KEY)
    # Extract the video urls from each query result page.
    elif params.step == LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE:
        self.pageprocess(params)
def getsearchresult(self, params):
    info = params.customized['query']
    xpath = XPathUtility(html=params.content)
    hrefs = xpath.xpath('//li/h3/a/@href')
    titles = xpath.getlist('//li/h3/a')
    pubtimes = xpath.xpath('//li/p')
    today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                       TimeUtility.DATE_FORMAT_DEFAULT).date()
    urllist = []
    for index in range(len(titles)):
        # The title must contain the query keyword.
        if Common.checktitle(info, titles[index]):
            pubtimestr = TimeUtility.getuniformdate(pubtimes[index].text)
            pubtime = datetime.datetime.strptime(pubtimestr,
                                                 TimeUtility.DATE_FORMAT_DEFAULT).date()
            interval = today - pubtime
            # Keep only results published within the query period.
            if interval.days <= self.querylastdays:
                urllist.append(hrefs[index])
            else:
                # Results are sorted by time, so once one falls outside the period,
                # all later ones will too.
                break
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
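# A hedged stdlib sketch of the recency filter above: keep a result only when it was
# published within querylastdays of today. The date format string is an assumption;
# the real format comes from TimeUtility.DATE_FORMAT_DEFAULT.
import datetime

def example_within_days(pubtimestr, querylastdays, fmt='%Y-%m-%d'):
    today = datetime.date.today()
    pubtime = datetime.datetime.strptime(pubtimestr, fmt).date()
    return (today - pubtime).days <= querylastdays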
def pageprocess(self, params):
    xparser = XPathUtility(params.content)
    # Links on this result page.
    hreflist = xparser.xpath('//h3/a/@href')
    hrefs = []
    for mid_url in hreflist:
        mid = self.preprocess(mid_url)
        if mid is not None:
            hrefs.append(mid)
    # Publish times of all entries on this page.
    publictime = xparser.xpath('//*[@class="scontent"]/text()[1]')
    publicTimes = []
    for timeindex in publictime:
        middle = str(timeindex).replace('\n', '').replace('\t', '').strip()
        # Keep only the date and time fields.
        publicTimes.append(middle.split(' ')[0] + ' ' + middle.split(' ')[1])
    # Titles of all entries on this page.
    titles = []
    titles_list = xparser.getlist('//h3')
    for title in titles_list:
        titles.append(str(title).replace('\n', '').replace('\t', '').strip())
    # The search keyword.
    KEY_mid = params.customized['KEY']
    KEY = Common.urldec(KEY_mid)
    titlePatten = KEY
    # The earliest publish time still inside the search window.
    today = datetime.datetime.now()
    before_days = today + datetime.timedelta(-self.inputtime)
    before_time = str(before_days).split('.')[0]
    urllist = []
    len_hrefs = len(hrefs)
    number = 0
    for index in publicTimes[:len_hrefs]:
        # Keep the entry when the title matches the keyword and the entry was
        # published inside the search window.
        flg = Common.checktitle(titlePatten, str(titles[number]))
        if index > before_time and flg:
            urllist.append(hrefs[number])
        number += 1
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def step1(self, params):
    Logger.getlogging().info("LaohuComments.STEP_1")
    # 1. Get the site subdomain from the original url.
    field = self.r.parse('^http://(\w+)\.laohu\.com/.*', params.originalurl)[0]
    if field == 'bbs':
        # Forum page: request the url again to obtain the uniqid.
        self.storeurl(params.originalurl, params.originalurl,
                      LaohuComments.STEP_2_BBS, {'field': field})
    else:
        # Non-forum page, e.g. http://ff.laohu.com/201612/215072.html
        xhtml = XPathUtility(html=params.content)
        token = xhtml.getlist('//*[@id="t_token"]')[0]
        sourceId = self.r.getid('source_id', params.content, '\s*=\s*')
        # First comment page url
        # (COMMENTS_URL template: http://member.laohu.com/comment/show/?token=%s&oder=new)
        comments_url = LaohuComments.COMMENTS_URL % (token)
        # Ask the download platform to fetch the first page of comments.
        self.storeurl(comments_url, params.originalurl, LaohuComments.STEP_2,
                      {'token': token, 'sourceId': sourceId})
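# A hedged stdlib sketch of the subdomain check above: the first capture group of the
# url pattern decides whether this is a bbs page or a regular article page.
import re

def example_subdomain(url):
    m = re.match(r'^http://(\w+)\.laohu\.com/.*', url)
    return m.group(1) if m else None

# example_subdomain('http://ff.laohu.com/201612/215072.html') -> 'ff'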
def download(self):
    doneurl = TencentDownloader.DONE_FILE_URL.format(taskid=self.taskinfo.taskid)
    html = TencentDownloader.httpget(doneurl)
    if html:
        xparse = XPathUtility(html)
        for donefile in xparse.getlist(r'//tr/td[2]/a'):
            if donefile.endswith('done') and donefile not in self.downloadedfiles:
                for upfile in self.upload_file_list:
                    if donefile.startswith(upfile):
                        FileUtility.mkdirs(self.download_path)
                        self.execute(TencentDownloader.DOWNLOAD_COMMAND.format(
                            taskid=self.taskinfo.taskid, filename=donefile))
                        FileUtility.move('./' + donefile, self.download_path)
                        break
                self.downloadedfiles.append(donefile)
    return tencentdownloader.TencentDownloader.download(self)
def step2news(self, params):
    Logger.getlogging().info("ZolbbsComments.STEP_2")
    kindid = params.customized['kindid']
    docurl = params.customized['docurl']
    xparser = XPathUtility(params.content)
    comments_count = int(xparser.getlist('//*[@class="comment-num"]')[0])
    # Incremental check: stop if we already have at least this many comments.
    cmtnum = CMTStorage.getcount(params.originalurl, True)
    if cmtnum >= comments_count:
        return
    NewsStorage.setcmtnum(params.originalurl, comments_count)
    page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
    if page_num >= self.maxpages:
        page_num = self.maxpages
    for page in range(1, page_num + 1):
        comment_url = self.COMMENT_URL_NEWS.format(kindid=kindid, docurl=docurl, page=page)
        self.storeurl(comment_url, params.originalurl, ZolnewsComments.STEP_3)
def geturlcomments(self, params):
    # Extract the individual comments.
    xparser = XPathUtility(params.content)
    comments_xpath = xparser.xpath('//*[contains(@id, "cm_")]')
    if not comments_xpath:
        return
    # Extract the publish times.
    ip_pubtimes_xpath = xparser.getlist('//*[contains(@id,"CList___CommentList_UserLink_")]/..')
    if len(comments_xpath) == len(ip_pubtimes_xpath):
        comments = []
        for index in range(len(comments_xpath)):
            cmti = CommentInfo()
            # Keep only comments newer than the last recorded update time.
            if URLStorage.storeupdatetime(params.originalurl, getuniformtime(ip_pubtimes_xpath[index])):
                cmti.content = comments_xpath[index].text
                comments.append(cmti)
        # Store the collected comments.
        if len(comments) > 0:
            self.commentstorage.store(params.originalurl, comments)
def process(self, params):
    Logger.getlogging().info(params.url)
    try:
        if params.step == Rain8Comments.STEP_1:
            # Step 1: extract articleId from the document url and build the first comment page url.
            articleId = self.r.parse('http://\w+\.tadu\.com/\w+/(\d+).*', params.originalurl)[0]
            comments_url = Rain8Comments.COMMENT_URL.format(articleId=articleId, page=1)
            self.storeurl(comments_url, params.originalurl, Rain8Comments.STEP_2,
                          {'articleId': articleId})
        elif params.step == Rain8Comments.STEP_2:
            # Step 2: read the total comment count and build the url for every comment page.
            articleId = params.customized['articleId']
            xparser = XPathUtility(params.content)
            countstr = xparser.getstring('//h4')
            if self.r.search(u'\d+', countstr):
                comment_count = int(self.r.parse(u'(\d+)', countstr)[1])
                if comment_count == 0:
                    return
                # Incremental check: stop if we already have at least this many comments.
                cmtnum = URLStorage.getcmtnum(params.originalurl)
                if cmtnum >= comment_count:
                    return
                URLStorage.setcmtnum(params.originalurl, comment_count)
                # Total number of comment pages.
                totalPage = int(math.ceil(float(comment_count) / TaDuComments.PAGE_SIZE))
                for page in range(1, totalPage + 1):
                    url = TaDuComments.COMMENT_URL.format(articleId=articleId, page=page)
                    self.storeurl(url, params.originalurl, TaDuComments.STEP_3)
        elif params.step == TaDuComments.STEP_3:
            # Step 3: extract the comments from each page stored in step 2.
            Logger.getlogging().info("params.step == 3")
            xparser = XPathUtility(params.content)
            comments = xparser.getlist('//ul[@class="cmetlist bookreview-cmetlist"]/li/div/div[2]/p')
            commenttimes = xparser.getlist('//ul[@class="cmetlist bookreview-cmetlist"]/li/div/div[2]/span')
            commentsInfo = []
            for index in range(len(comments)):
                # Strip the three-character leading label from the time text.
                publicTime = commenttimes[index][3:]
                cmti = CommentInfo()
                tm = TimeUtility.getuniformtime(publicTime, '%Y-%m-%d %H:%M')
                if URLStorage.storeupdatetime(params.originalurl, tm):
                    cmti.content = comments[index].strip()
                    commentsInfo.append(cmti)
            # Store the collected comments.
            if len(commentsInfo) > 0:
                self.commentstorage.store(params.originalurl, commentsInfo)
        else:
            Logger.getlogging().error('proparam.step == {step}'.format(step=params.step))
    except Exception as e:
        traceback.print_exc()
def process(self, proparam):
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step == ishangmanComments.STEP_1:
            # Extract the two article ids from the url.
            articleIds = re.findall(r'^http://(\w+)\.ishangman\.com/\w+/(\d+)', proparam.url)[0]
            articleId1 = articleIds[0]
            articleId2 = articleIds[1]
            # Comment type.
            commenttype = int(self.r.parse(ur'commenttype = (.*);', proparam.content)[0])
            # First page of comments.
            url = ishangmanComments.COMMENTS_URL % (articleId1, articleId2, commenttype, 1)
            self.storeurl(url, proparam.originalurl, ishangmanComments.STEP_2,
                          {'articleId1': articleId1,
                           'articleId2': articleId2,
                           'commenttype': commenttype})
        elif proparam.step == ishangmanComments.STEP_2:
            articleId1 = proparam.customized['articleId1']
            articleId2 = proparam.customized['articleId2']
            commenttype = proparam.customized['commenttype']
            # Total comment count.
            xhtml = XPathUtility(html=proparam.content)
            if articleId1 == 'comic':
                comments_count = int(xhtml.getlist('//*[contains(@class,"ismcartondiv1")]/p/strong')[0])
                if comments_count:
                    NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            else:
                comments_count = int(self.r.parse(ur'(\d+).*',
                                                  xhtml.getlist('//*[@class="comment_lctwidl"]/p')[0])[0])
                if comments_count:
                    NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            # Number of comment pages still to fetch.
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            if int(comments_count) == 0:
                return
            page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            # Comment page urls.
            for page in range(1, page_num + 1):
                url = ishangmanComments.COMMENTS_URL % (articleId1, articleId2, commenttype, page)
                self.storeurl(url, proparam.originalurl, ishangmanComments.STEP_3,
                              {'articleId1': articleId1})
        elif proparam.step == ishangmanComments.STEP_3:
            try:
                Logger.getlogging().debug(proparam.originalurl)
                commentsInfo = []
                articleId1 = proparam.customized['articleId1']
                xparser = XPathUtility(proparam.content)
                if articleId1 == 'comic':
                    # Comic comments.
                    soup = BeautifulSoup(proparam.content, 'html5lib')
                    comments = soup.select('.ismcartondiv2')
                else:
                    # Forum comments.
                    comments = xparser.getcomments('/html/body/div/span[2]/p[1]')
                    # Forum comment times.
                    updateTime = xparser.getcomments('/html/body/div/span[2]/div[1]')
                for index in range(len(comments)):
                    cmti = CommentInfo()
                    if articleId1 == 'comic':
                        # Timestamps come without a year, e.g. 12-15 10:30.
                        publictime = self.r.parse(ur'(\d{2}-\d+ \d+:\d+)', comments[index].get_text())[0]
                        if publictime:
                            cmt_month = publictime.split("-")[0]
                            curmonth = time.localtime().tm_mon
                            if int(cmt_month) < curmonth:
                                publictime = TimeUtility.getcurrentdate()[0:4] + '-' + publictime
                            else:
                                publictime = '2016' + '-' + publictime
                        curtime = TimeUtility.getuniformtime(publictime)
                        content = comments[index].get_text().split('\n')[0]
                        # nick = comments[1].get('nickname', 'anonymous')
                        # if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                        #     CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
                    else:
                        publictime = updateTime[index][:-8]
                        tm = getuniformtime(publictime)
                        if NewsStorage.storeupdatetime(proparam.originalurl, tm):
                            cmti.content = comments[index]
                            commentsInfo.append(cmti)
                # Store the collected comments.
                if len(commentsInfo) > 0:
                    self.commentstorage.store(proparam.originalurl, commentsInfo)
            except:
                Logger.printexception()
                Logger.getlogging().error(
                    'extract comment error from {site}'.format(site=proparam.url))
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception as e:
        traceback.print_exc()
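# A hedged stdlib sketch of the year-inference step above: comic comment timestamps come
# without a year ('12-15 10:30'), so a month later than the current month is taken to belong
# to an earlier year. The original hardcodes '2016' for that case; this sketch generalizes
# it to "previous year". Sample values are illustrative only.
import time

def example_add_year(publictime):
    # publictime like '12-15 10:30' (month-day hour:minute, no year)
    cmt_month = int(publictime.split('-')[0])
    now = time.localtime()
    year = now.tm_year if cmt_month < now.tm_mon else now.tm_year - 1
    return '%d-%s' % (year, publictime)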