def getpagecomments(self, params):
    """Extract matching result links from a tieba search page.

    Keeps results whose title matches the customized 'query' keyword and
    whose publish date falls within ``self.querylastdays`` days of today,
    then stores the preprocessed urls for SPIDER_S2_WEBSITE_TIEBA.
    """
    query = params.customized['query']
    xpath = XPathUtility(html=params.content)
    hrefs = xpath.xpath('//*[@class="sosResult"]/strong/a/@href')
    titles = xpath.getlist('//*[@class="sosResult"]/strong/a')
    pubtimes = xpath.xpath('//*[@class="sosResult"]/span/cite[3]')
    today = datetime.datetime.strptime(
        TimeUtility.getcurrentdate(), TimeUtility.DATE_FORMAT_DEFAULT).date()
    # Loop-invariant conversion hoisted out of the loop.
    lastdays = int(self.querylastdays)
    urllist = []
    # zip() guards against the three xpath result lists having different
    # lengths: the original indexed hrefs/pubtimes with the titles index,
    # which could raise IndexError on a malformed result page.
    for title, href, pubtime_node in zip(titles, hrefs, pubtimes):
        # Only keep results whose title matches the queried keyword.
        if not Common.checktitle(query, title):
            continue
        pubtimestr = TimeUtility.getuniformtime(
            pubtime_node.text).split(' ')[0]
        pubtime = datetime.datetime.strptime(
            pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
        interval = today - pubtime  # fixed typo: was 'inteveral'
        # Only keep results published inside the configured look-back window.
        if interval.days <= lastdays:
            newurl = self.preprocess(href)
            if newurl is not None:
                urllist.append(newurl)
    if urllist:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def process(self, params):
    """Two-step comment crawl.

    STEP_1: pull the numeric resource id out of the page source and queue
    the first comments page.  STEP_2: parse one comments page, store each
    top-level comment and its child replies, then queue the next page.
    Pagination stops when a page yields no comment blocks.
    """
    try:
        if params.step is AllComments.STEP_1:
            try:
                # First "id":<digits> occurrence in the page is the resource id.
                action = self.r.parse('\"id\":(\d+)',params.content)[0]
                comments_url = AllComments.COMMENTS_URL.format(id=action,page=1)
                # Queue page 1; 'action'/'page' travel as customized state.
                self.storeurl(comments_url, params.originalurl, AllComments.STEP_2, {'action':action,'page':1})
            except:
                # No id found -> nothing to crawl (deliberate best-effort).
                return
        elif params.step is AllComments.STEP_2:
            soup = BeautifulSoup(params.content,'html5lib')
            # Comment blocks: id contains "comment", class contains "note-comment".
            divs = soup.find_all(attrs={'id':re.compile('comment'),'class':re.compile('note-comment')})
            if not divs:
                # Empty page terminates pagination (no next page queued).
                return
            for div in divs:
                content = div.select_one('div.content > p').get_text()
                pubtime = div.select_one('div.content > .meta-top > .reply-time > a').get_text()
                curtime = TimeUtility.getuniformtime(pubtime)
                # Dedup before store: pages are re-crawled incrementally.
                if not CMTStorage.exist(params.originalurl, content, curtime, ''):
                    CMTStorage.storecmt(params.originalurl, content, curtime, '')
                # Child replies nested under this top-level comment.
                child_divs = div.select('.content > .child-comment-list > .child-comment')
                if child_divs:
                    for item in child_divs:
                        comment = item.select_one('p').get_text()
                        # Collapse all whitespace inside the reply text.
                        content = ''.join(comment.split())
                        replytime = item.select_one('.reply-time > a').get_text()
                        curtime = TimeUtility.getuniformtime(replytime)
                        if not CMTStorage.exist(params.originalurl, content, curtime, ''):
                            CMTStorage.storecmt(params.originalurl, content, curtime, ''):
                # NOTE: the line above ends the per-div work; next page below.
            # Queue the next page with the page counter advanced by one.
            comments_url = AllComments.COMMENTS_URL.format(id=params.customized['action'], page = int(params.customized['page'])+1)
            self.storeurl(comments_url, params.originalurl, AllComments.STEP_2, {'action':params.customized['action'], 'page':int(params.customized['page'])+1})
    except:
        Logger.printexception()
def step2(self, params):
    """Handle page 1 of the comment list and queue the remaining pages."""
    Logger.getlogging().info("MkzhanComments.STEP_2")
    comic_id = params.customized['comic_id']
    payload = json.loads(params.content)
    total = int(payload['data']['count'])
    stored = CMTStorage.getcount(params.originalurl, True)
    # Page 1 is already in hand: persist each comment we have not seen yet.
    for entry in payload['data']['list']:
        text = entry['content']
        posted = TimeUtility.getuniformtime(entry['create_time'])
        author = entry['username']
        if not CMTStorage.exist(params.originalurl, text, posted, author):
            CMTStorage.storecmt(params.originalurl, text, posted, author)
    # Record the site-reported comment total.
    NewsStorage.setcmtnum(params.originalurl, total)
    if stored >= total:
        Logger.getlogging().warning(
            '{url}:30000 No comments'.format(url=params.originalurl))
        return
    # Pages still to fetch, capped at the configured maximum.
    pages = int(math.ceil(float(total - stored) / self.PAGE_SIZE))
    pages = min(pages, self.maxpages)
    for page in range(2, pages + 1):
        comments_url = MkzhanComments.COMMENTS_URL % (comic_id, page,
                                                      self.PAGE_SIZE)
        self.storeurl(comments_url, params.originalurl,
                      MkzhanComments.STEP_3)
def setclicknum(self, params):
    """Persist the click counter and publish date from a JSON payload."""
    try:
        payload = json.loads(params.content)
        # NOTE(review): the click counter is read from 'cmtVote' — the field
        # name suggests votes rather than clicks; confirm upstream schema.
        NewsStorage.setclicknum(params.originalurl, payload['cmtVote'])
        NewsStorage.setpublishdate(
            params.originalurl,
            TimeUtility.getuniformtime(payload['createTime']))
    except:
        Logger.printexception()
def step3(self, params):
    """Store every previously unseen comment from one comment-list page."""
    Logger.getlogging().info("MkzhanComments.STEP_3")
    payload = json.loads(params.content)
    for entry in payload['data']['list']:
        text = entry['content']
        posted = TimeUtility.getuniformtime(entry['create_time'])
        author = entry['username']
        if not CMTStorage.exist(params.originalurl, text, posted, author):
            CMTStorage.storecmt(params.originalurl, text, posted, author)
def process(self, params):
    """Three-step comment crawl for pcauto.

    step None: queue the first comments page for the original url.
    STEP_2: read the reported total, then queue every page still needed
            (incremental: only pages covering comments we do not hold yet).
    STEP_3: parse one page of comment JSON and store new comments.
    """
    try:
        if params.step is None:
            # Build the page-1 comments url and hand it to the downloader.
            comments_url = PcautoComments.COMMENTS_URL % (
                params.originalurl, 1, PcautoComments.PAGE_SIZE)
            self.storeurl(comments_url, params.originalurl,
                          PcautoComments.STEP_2)
        elif params.step == PcautoComments.STEP_2:
            comments = json.loads(params.content)
            comments_count = int(comments['total'])
            NewsStorage.setcmtnum(params.originalurl, comments_count)
            if comments_count == 0:
                return
            # Incremental crawl: skip when we already hold every comment.
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            if cmtnum >= comments_count:
                return
            page_num = int(
                math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
            page_num = min(page_num, self.maxpages)
            for page in range(1, page_num + 1):
                commentUrl = PcautoComments.COMMENTS_URL % (
                    params.originalurl, page, PcautoComments.PAGE_SIZE)
                self.storeurl(commentUrl, params.originalurl,
                              PcautoComments.STEP_3)
        elif params.step == PcautoComments.STEP_3:
            commentsinfo = json.loads(params.content)
            # Removed an unused `comments = []` accumulator and dead
            # commented-out code from an older storage scheme.
            for comment in commentsinfo['data']:
                content = comment['content']
                curtime = TimeUtility.getuniformtime(comment['createTime'])
                # dict.get replaces the original broad try/except around
                # the optional nickname field.
                nick = comment.get('nickName', 'anonymous')
                if not CMTStorage.exist(params.originalurl, content,
                                        curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content,
                                        curtime, nick)
    except Exception:
        traceback.print_exc()
def geturlcomments(self, params):
    """Walk the JSON 'data' array and persist every unseen comment."""
    try:
        payload = json.loads(params.content)
        if payload['data']:
            for entry in payload['data']:
                text = entry['content']
                posted = TimeUtility.getuniformtime(entry['createTime'])
                author = entry['nickName']
                if not CMTStorage.exist(params.originalurl, text, posted,
                                        author):
                    CMTStorage.storecmt(params.originalurl, text, posted,
                                        author)
    except:
        Logger.printexception()
def getinfo(self, params):
    """Extract article metadata (title, counters, publish time) from the
    JSON payload and write it to NewsStorage in a single call."""
    try:
        jsondata = json.loads(params.content)
        # Bind the nested object once instead of re-indexing five times;
        # also removed a dead `data = {}` assignment that was immediately
        # overwritten in the original.
        article = jsondata['article']
        publishtime = TimeUtility.getuniformtime(article['publishtime'])
        data = {
            "title": article['title'],
            "clicknum": article['readnum'],
            "votenum": article['praisenum'],
            "fansnum": article['favoritenum'],
            "publishdate": publishtime
        }
        NewsStorage.seturlinfo(params.originalurl, '', '', data)
    except:
        Logger.printexception()
def getyueduurlcomment(self, proparam):
    """Store each previously unseen comment from a yuedu JSON response."""
    try:
        payload = json.loads(proparam.content)
        for entry in payload['data']:
            # Normalize the posted time to the project-wide uniform format.
            posted = TimeUtility.getuniformtime(entry['posttime'].strip())
            text = entry['text']
            author = entry['username']
            if not CMTStorage.exist(proparam.originalurl, text, posted,
                                    author):
                CMTStorage.storecmt(proparam.originalurl, text, posted,
                                    author)
    except:
        Logger.printexception()
        Logger.getlogging().error(
            'extract comment error from {site}'.format(site=proparam.url))
def step2(self, params):
    """Store every floor comment on a forum page.

    Returns True while the oldest comment on the page is still newer than
    the stored watermark (caller should keep paging), False otherwise.
    """
    try:
        soup = bs(params.content, 'html5lib')
        cmtsContainer = soup.find_all(
            attrs={'id': re.compile(r'readfloor_\d+')})
        timelist = []
        for cmtContainer in cmtsContainer:
            cmtContent = cmtContainer.find(attrs={
                'class': 'f14 mb10'
            }).get_text()
            cmtPubDate = cmtContainer.find(attrs={
                'class': 'tipTop s6'
            }).get_text()
            # NOTE(review): unlike sibling parsers this stores without a
            # CMTStorage.exist dedup check — confirm storecmt dedups.
            CMTStorage.storecmt(params.originalurl, cmtContent, cmtPubDate, '')
            timelist.append(TimeUtility.getuniformtime(cmtPubDate))
        # BUG FIX: min() on an empty list raises ValueError; the original
        # fell into the bare except (spurious traceback) and implicitly
        # returned None on pages with no floors.  Return False explicitly.
        if not timelist:
            return False
        if not self.isnewesttime(params.originalurl, min(timelist)):
            return True
        return False
    except:
        Logger.printexception()
def step3tt(self, params):
    """Parse the tt comment JSON, store new comments, and report whether
    the oldest comment seen is still newer than the stored watermark.

    Returns True to continue paging, False to stop, None on parse failure.
    """
    try:
        jsondata = json.loads(params.content)
        if jsondata:
            # Seed with "now" so min() below is always well-defined.
            publishlist = [
                TimeUtility.getcurrentdate(TimeUtility.DEFAULTFORMAT)
            ]
            try:
                if jsondata == "ERROR_PARAMETER":
                    return
                entitylist = jsondata['resultDO'].get('entityList', [])
                for comment in entitylist:
                    content = self.strfilter(comment['body'])
                    # Timestamps look like "Jul 3, 2017 4:46:30 PM".
                    curtime = comment['replyTime']
                    # BUG FIX: the original used %H (24-hour) together with
                    # %p, so strptime ignored the AM/PM marker and afternoon
                    # times parsed 12 hours early.  %I is the 12-hour field.
                    curtime = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.strptime(curtime, '%b %d, %Y %I:%M:%S %p'))
                    nick = comment['userName']
                    publishlist.append(curtime)
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)
            except:
                Logger.printexception()
                Logger.getlogging().error(
                    'extract no comment from {site}'.format(
                        site=params.url))
            if not self.isnewesttime(params.originalurl, min(publishlist)):
                return False
            return True
    except:
        Logger.printexception()
        Logger.getlogging().error(
            'extract comment error from {site}'.format(site=params.url))
def step3(self, params):
    """Store every new comment from a JSON mapping of id -> comment dict."""
    try:
        jsondata = json.loads(params.content)
        comments = jsondata['comments']
    except:
        Logger.getlogging().warning('{}:30000 No comments'.format(
            params.originalurl))
        return
    # Removed an unused `cmts = []` accumulator left over from an older
    # storage scheme; bind each comment dict once instead of re-indexing.
    for key in comments:
        comment = comments[key]
        try:
            nickname = comment['user']['nickname']
        except:
            # Nickname is optional in the payload.
            nickname = 'anonymous'
        # Normalize the (numeric) timestamp to the uniform format.
        curtime = TimeUtility.getuniformtime(str(comment['createTime']))
        content = comment['content']
        if not CMTStorage.exist(params.originalurl, content, curtime,
                                nickname):
            CMTStorage.storecmt(params.originalurl, content, curtime,
                                nickname)
def process(self, params):
    """Three-step comment crawl for mtime, with two url families.

    news.mtime.com:  STEP_1 derives a topic id from the url and queues the
    comment index; STEP_2 reads totals/page size and queues every page;
    STEP_3 parses one page's JSONP payload and stores new comments.
    people.mtime.com: STEP_1 derives a docId and queues the first comment
    page; STEP_2 counts pager entries and queues each page; STEP_3 scrapes
    comments out of the HTML.
    """
    try:
        if self.r.search('^http://news.mtime.com/.*', params.originalurl):
            if params.step is MtimeComments.STEP_1:
                Logger.getlogging().info("MtimeComments.STEP_1")
                topic_id = self.r.parse('^http://news.mtime.com/\d+/\d+/\d+\/(\d+)\.html', params.originalurl)[0]
                # Build the first comments-page url from the original url.
                commentinfo_url = MtimeComments.COMMENT_URL.format(topic_id=topic_id, page=0)
                self.storeurl(commentinfo_url, params.originalurl, MtimeComments.STEP_2, {'topic_id': topic_id})
            elif params.step == MtimeComments.STEP_2:
                Logger.getlogging().info("MtimeComments.STEP_2")
                topic_id = params.customized['topic_id']
                # Strip the JSONP wrapper, keeping the {...} JSON object.
                # NOTE(review): the slice indices come from the UNstripped
                # string but are applied to the stripped one — off by N if
                # the payload has leading whitespace; confirm upstream.
                params.content = params.content.strip()[params.content.index('{'):params.content.index(';')]
                commentsinfo = json.loads(params.content)
                comments_count = commentsinfo['value']['totalCount']
                if comments_count:
                    NewsStorage.setcmtnum(params.originalurl, comments_count)
                cmtnum = CMTStorage.getcount(params.originalurl, True)
                # Incremental check: nothing to do if we hold everything.
                if cmtnum >= comments_count:
                    return
                page_num = int(
                    math.ceil(float(int(commentsinfo['value']['totalCount']) - cmtnum) / float(commentsinfo['value']['pageSize'])))
                if page_num >= self.maxpages:
                    page_num = self.maxpages
                for index in range(1, page_num + 1, 1):
                    commentinfo_url = MtimeComments.COMMENT_URL.format(topic_id=topic_id, page=index)
                    self.storeurl(commentinfo_url, params.originalurl, MtimeComments.STEP_3)
            elif params.step == MtimeComments.STEP_3:
                Logger.getlogging().info("MtimeComments.STEP_3")
                # Step3: parse a page queued by Step2 and extract comments.
                # Same JSONP-stripping caveat as in STEP_2 above.
                params.content = params.content.strip()[params.content.index('{'):params.content.index(';')]
                commentsinfo = json.loads(params.content)
                for item in commentsinfo['value']['comments']:
                    content = item['content']
                    curtime = TimeUtility.getuniformtime(item['enterTime'])
                    nick = item['nickName']
                    # Dedup before store (pages are re-crawled incrementally).
                    if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content, curtime, nick)
            else:
                Logger.getlogging().error('proparam.step == {step}'.format(step = params.step))
                return
        elif self.r.search('^http://people.mtime.com/.*', params.originalurl):
            if params.step is MtimeComments.STEP_1:
                Logger.getlogging().info("MtimeComments.STEP_1")
                cmtnum = CMTStorage.getcount(params.originalurl,True)
                if cmtnum:
                    NewsStorage.setcmtnum(params.originalurl, cmtnum)
                docId = self.r.parse('^http://people.mtime.com/(\d+)/', params.originalurl)[0]
                # Build the first comments-page url for this person page.
                commentinfo_url = MtimeComments.COMMENT_URL_PEOPLE1.format(docId=docId)
                self.storeurl(commentinfo_url, params.originalurl, MtimeComments.STEP_2, {'docId': docId})
            elif params.step == MtimeComments.STEP_2:
                Logger.getlogging().info("MtimeComments.STEP_2")
                docId = params.customized['docId']
                soup = BeautifulSoup(params.content, 'html5lib')
                # Each '.num' element is one pager entry; page count = len.
                page = soup.select('.num')
                # Re-queue the current page itself for STEP_3 parsing.
                self.storeurl(params.url, params.originalurl, MtimeComments.STEP_3)
                for index in range(2, len(page) + 2, 1):
                    commentinfo_url = MtimeComments.COMMENT_URL_PEOPLE2.format(docId=docId, page=index)
                    self.storeurl(commentinfo_url, params.originalurl, MtimeComments.STEP_3)
            elif params.step == MtimeComments.STEP_3:
                Logger.getlogging().info("MtimeComments.STEP_3")
                # Step3: scrape comments, timestamps and nicks from the HTML.
                soup = BeautifulSoup(params.content, 'html5lib')
                comments = soup.select('div.mod_short')
                commentTimes = soup.select('span.fl')
                nicks = soup.select('p.px14')
                for index in range(0, len(comments), 1):
                    # NOTE(review): replace('\s','') removes the literal
                    # two-char sequence backslash-s, not whitespace.
                    content = comments[index].get_text().strip().replace('\s','')
                    # commentTimes is offset by one relative to comments.
                    curtime = TimeUtility.getuniformtime((self.r.parse(u'entertime="(.+?)"', str(commentTimes[index + 1]))[0]))
                    nick = nicks[index].get_text().strip()
                    if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content, curtime, nick)
            else:
                Logger.getlogging().error('proparam.step == {step}'.format(step = params.step))
                return
    except Exception,e:
        traceback.print_exc()
def process(self, params):
    """Two-step forum crawl: STEP_1 stores the first post as the article
    body and queues every page; STEP_2 stores each post as a comment."""
    try:
        if params.step is self.STEP_1:
            soup = BeautifulSoup(params.content, 'html5lib')
            # The first post on page 1 serves as the article body.
            body = soup.find(attrs={'class': 'post_message post_first'})
            if body:
                NewsStorage.setbody(params.originalurl, body.get_text().strip())
            else:
                Logger.getlogging().debug(
                    '{url}:30000!'.format(url=params.originalurl))
            # Thread key taken from the last url path segment (sans suffix).
            keyvalue = params.url.split("/")[-1].split(".")[0]
            page = soup.select('.pager > a')
            # With <=2 pager links there is only one page; otherwise the
            # second-to-last link carries the last page number.
            if len(page) <= 2:
                page = 1
            else:
                page = page[-2].get_text()
                page = int(re.findall('\d+', page)[0])
            # Clamp to the configured page limit, if any.
            if self.pagelimit:
                if int(page) > self.pagelimit:
                    Logger.getlogging().warning(
                        'the pageMaxNumber is shutdown to {0}'.format(
                            self.pagelimit))
                    page = self.pagelimit
            for pg in range(1, int(page + 1)):
                comments_url = self.COMMENTS_URL % (keyvalue + '-' + str(pg))
                self.storeurl(comments_url, params.originalurl, self.STEP_2, {
                    'page': pg,
                    'pagetotal': page
                })
        elif params.step is self.STEP_2:
            page = params.customized['page']
            soup = BeautifulSoup(params.content, 'html5lib')
            posts = soup.select('.post_wrap')
            if not posts:
                Logger.getlogging().debug(
                    '{url}:30000!'.format(url=params.originalurl))
                return
            for post in posts:
                post_msg = post.select_one('.post_message').get_text()
                # Collapse all whitespace inside the post text.
                post_msg = ''.join(post_msg.split())
                # e.g. class="user-42845238 post_time needonline" > 发表于 2017-07-27 23:53
                post_time = post.find(
                    attrs={
                        'class': re.compile('user-.+post_time needonline')
                    }).get_text()
                curtime = TimeUtility.getuniformtime(post_time)
                content = post_msg.strip()
                try:
                    # e.g. class="user-40693231 needonline" > <nickname>
                    nick = post.find(
                        attrs={
                            'class': re.compile('user-.+ needonline')
                        }).get_text()
                except:
                    # Nick element missing -> placeholder nickname.
                    nick = 'nickname'
                # Dedup before store (pages are re-crawled incrementally).
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except:
        Logger.printexception()
def process(self, proparam):
    """Three-step comment crawl for bbs.rayli.com.cn galleries.

    STEP_1 derives the article id from the url and queues page 1;
    STEP_2 reads the reply count and queues every needed page;
    STEP_3 scrapes comments/timestamps/nicks from one page's HTML and
    records the earliest timestamp seen as the publish date.
    """
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step is rayliComments.STEP_1:
            # Article id embedded in the gallery url.
            articleId = re.findall(
                '^http://bbs\.rayli\.com\.cn/gallery-(\d+)-\d+.html',
                proparam.url).__getitem__(0)
            # Build the page-1 comments url.
            comments_url = rayliComments.COMMENTS_URL % (articleId, 1)
            self.storeurl(comments_url, proparam.originalurl,
                          rayliComments.STEP_2, {
                              'articleId': articleId,
                          })
        elif proparam.step == rayliComments.STEP_2:
            articleId = proparam.customized['articleId']
            # Reply count scraped from the page ("回复:" = "replies:").
            comments_count = float(
                re.findall(ur'回复:</span> (\d+)</div>',
                           proparam.content).__getitem__(0))
            if int(comments_count) == 0:
                return
            # Incremental check: skip when we already hold everything.
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            if cmtnum >= comments_count:
                return
            NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            page_num = int(
                math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            # Queue every comment page.
            for page in range(1, page_num + 1, 1):
                url = rayliComments.COMMENTS_URL % (articleId, page)
                self.storeurl(url, proparam.originalurl, rayliComments.STEP_3)
        elif proparam.step == rayliComments.STEP_3:
            commentsInfo = []
            soup = BeautifulSoup(proparam.content, 'html.parser')
            # Comment bodies.
            comments = soup.select('.t_f')
            # Timestamps ("发表于" = "posted at").
            commentTime = self.r.parse(
                ur'<em id="authorposton\d+">发表于 (.+?)</em>',
                proparam.content)
            # Nicknames.
            nicks = soup.select('.xw1')
            # On page 1 the first .t_f element is the opening post, not a
            # comment, so start scraping from index 1 there.
            page = int(
                self.r.parse(ur'page=1-page-(\d+)', proparam.url)[0])
            if page == 1:
                index = 1
            else:
                index = 0
            # Seed with "now" so min() below is always well-defined.
            publishlist = [
                TimeUtility.getcurrentdate(TimeUtility.DEFAULTFORMAT)
            ]
            if len(comments) > 0:
                # Store each previously unseen comment.
                for index in range(index, len(comments), 1):
                    content = comments[index].text.strip()
                    curtime = commentTime[index]
                    nick = nicks[index].text
                    publishlist.append(curtime)
                    if not CMTStorage.exist(proparam.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(proparam.originalurl, content,
                                            curtime, nick)
            # Earliest timestamp seen doubles as the publish date.
            if len(publishlist) > 0:
                publishdate = min(publishlist)
                NewsStorage.setpublishdate(proparam.originalurl, publishdate)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception, e:
        traceback.print_exc()