def getclick(self, params):
    """Queue the play-count request for a le.com video and backfill its
    publish date when none is stored yet.

    params: download result; params.originalurl is the page url and
    params.content its html.  Results are written to NewsStorage and the
    url queue via self.storeurl; nothing is returned.
    """
    pattern = 'https?://\w+\.le\.com.*/\w+/(\d+)\.html'
    if re.search(pattern, params.originalurl):
        if self.r.search(pattern, params.originalurl):
            vid = self.r.parse(pattern, params.originalurl)[0]
            playcount_url = self.PALYCOUNT_URL.format(vid=vid)
            self.storeurl(playcount_url, params.originalurl, LeComments.STEP_PALY)
    # Only try to extract a publish date when the stored one still equals
    # the default (presumably "unset" — TimeUtility.getintformtime(0)).
    if NewsStorage.getpublishdate(params.originalurl) == TimeUtility.getintformtime(0):
        if self.r.search('https?://sports\.le\.com/video/\d+\.html', params.originalurl):
            # Sports channel only: the publish time sits in the
            # live-video info node.
            pubTime = XPathUtility(params.content).getstring('//*[@class="live-vedio-infor"]')
            # BUGFIX: the original passed the still-undefined name
            # 'publishdate' to getuniformtime (NameError); the extracted
            # value is 'pubTime'.
            publishdate = TimeUtility.getuniformtime(pubTime)
            NewsStorage.setpublishdate(params.originalurl, publishdate)
        else:
            # Variety channel only: an 8-digit date is embedded in the title.
            title = XPathUtility(params.content).getstring('//h1[@class="j-video-name video-name"]')
            if title:
                if re.search('\d{8}', title):
                    publishdate = re.findall('\d{8}', title)[0]
                    NewsStorage.setpublishdate(params.originalurl, publishdate)
def step1(self, params):
    """Resolve the toutiao group id from the article url, record the page's
    publish time when available, and queue the first comment-list request.
    """
    Logger.getlogging().info("ToutiaoNewsComments.STEP_1")
    # First path segment of the article url; its leading type letter is
    # stripped to obtain the numeric group id.
    group_id = self.r.parse('http://www.toutiao.com/(\w+)/.*', params.originalurl)[0]
    if group_id:
        group_id = group_id[1:]
    # The publish time is best-effort: any failure is logged, never fatal.
    try:
        pubdate = self.r.getid("time", params.content) \
            or self.r.getid("publish_time", params.content)
        if pubdate:
            NewsStorage.setpublishdate(params.originalurl, pubdate)
    except:
        Logger.getlogging().error('{0}:30000'.format(params.originalurl))
    if not group_id:
        Logger.getlogging().error('{0}:30000'.format(params.originalurl))
        return
    first_page_url = ToutiaoNewsComments.COMMENTS_URL.format(group_id, 0, self.page_size)
    self.storeurl(first_page_url, params.originalurl,
                  ToutiaoNewsComments.STEP_2, {'group_id': group_id})
def get_url_id(self, params):
    """Only used for Tencent Video (v.qq.com) pages.

    cid identifies a series/collection/movie cover page, vid a single
    episode.  Queues the comment-id lookup for whichever id the url
    carries, then records the publish date and triggers the click lookup.
    """
    # BUGFIX: '.' before 'html' is now escaped in CID_PATTERN and
    # VID_PATTERN1 (was '.html'), matching VID_PATTERN2 and avoiding
    # accidental matches on any character before 'html'.
    CID_PATTERN = 'https?://v\.qq\.com/x/cover/(\w+)\.html'
    CID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid={cid}'
    VID_PATTERN1 = 'https?://v\.qq\.com/x/cover/\w+/(\w+)\.html'
    VID_PATTERN2 = 'https?://v\.qq\.com/x/page/(\w+)\.html'
    VID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&vid={vid}'
    if self.r.search(CID_PATTERN, params.originalurl):
        cid = self.r.parse(CID_PATTERN, params.originalurl)[0]
        url = CID_URL.format(cid=cid)
        self.storeurl(url, params.originalurl, self.STEP_COMMENT_FIRST_PAGE)
    elif self.r.search(VID_PATTERN1, params.originalurl):
        vid = self.r.parse(VID_PATTERN1, params.originalurl)[0]
        url = VID_URL.format(vid=vid)
        self.storeurl(url, params.originalurl, self.STEP_COMMENT_FIRST_PAGE)
    elif self.r.search(VID_PATTERN2, params.originalurl):
        vid = self.r.parse(VID_PATTERN2, params.originalurl)[0]
        url = VID_URL.format(vid=vid)
        self.storeurl(url, params.originalurl, self.STEP_COMMENT_FIRST_PAGE)
    # Publish date: prefer the page's 'publish_date' field, falling back
    # to the xpath locations used across the different qq layouts.
    publish_date = self.r.getid('publish_date', params.content, split=':')
    if not publish_date:
        publish_date = XPathUtility(params.content).getstring(
            '//*[@class="video_tags"]/span|//*[@class="date"]|//*[@class="tag_item"]'
        )
    publish_date = TimeUtility.getuniformtime(publish_date)
    if publish_date:
        NewsStorage.setpublishdate(params.originalurl, publish_date)
    self.setclick(params)
def setclicknum(self, params):
    """Persist the click count ('cmtVote') and creation time from a json
    payload; any parse/storage failure is logged and swallowed.
    """
    try:
        payload = json.loads(params.content)
        NewsStorage.setclicknum(params.originalurl, payload['cmtVote'])
        NewsStorage.setpublishdate(
            params.originalurl,
            TimeUtility.getuniformtime(payload['createTime']))
    except:
        Logger.printexception()
def step_click(self, params):
    """Locate the entry whose id matches the stored sid in the json list
    and persist its play count and upload time for the original url.
    """
    wanted = str(params.customized['sid'])
    for entry in json.loads(params.content):
        if entry['id'] != wanted:
            continue
        # First matching entry wins; stop scanning afterwards.
        NewsStorage.setclicknum(params.originalurl, self.str2num(entry['playtimes']))
        NewsStorage.setpublishdate(params.originalurl,
                                   TimeUtility.getuniformtime(entry['adddate']))
        return
def process(self, params):
    """STEP_1 only: fetch answers/comments via step2 and record the
    question's creation date (itemprop="dateCreated").  Any error is
    logged and suppressed.
    """
    try:
        if params.step is ZhihuComments.STEP_1:
            self.step2(params)
            created = self.r.getid('itemprop="dateCreated" content',
                                   params.content, split='=')
            if created:
                NewsStorage.setpublishdate(params.originalurl, created)
    except:
        Logger.printexception()
def step3(self, params):
    """Store every comment on this json page; on the starting page also
    record the newest comment time as the article's publish date.
    """
    page = params.customized['page']
    startpage = params.customized['startpage']
    payload = json.loads(params.content)
    seen_dates = []
    for item in payload['data']['page_list']:
        try:
            when = item['create_time']
            # Note: the date is collected even if 'content' is missing
            # below (matches the original per-item try ordering).
            seen_dates.append(when)
            CMTStorage.storecmt(params.originalurl, item['content'], when, '')
        except:
            Logger.printexception()
    if page == startpage and seen_dates:
        NewsStorage.setpublishdate(params.originalurl, max(seen_dates))
def setpubtime(self, params):
    """For chanye.18183.com pages, recover the publish date from the page's
    two-digit-year timestamp and persist it.  Other urls are ignored.
    """
    if not re.search('http://chanye\.18183\.com/.*', params.url):
        return
    timestr = XPathUtility(params.content).getstring(
        '//*[@class="arc-other"]/span[3]|//*[@class="other"]/span[3]')
    if not timestr:
        return
    found = re.findall('(\d{2}-\d+-\d+)', timestr)
    if not found:
        return
    # The page shows a two-digit year; prepend the century digits of the
    # current local year (e.g. '17-08-31' -> '2017-08-31').
    newtime = getuniformtime(str(time.localtime()[0])[0:2] + found[0])
    if newtime:
        NewsStorage.setpublishdate(params.originalurl, newtime)
def step2(self, params):
    """Handle the first sina comment-api response: backfill publish date or
    click count, compute how many comment pages still need fetching, and
    queue them.
    """
    newsid = params.customized['newsid']
    channel = params.customized['channel']
    group = params.customized['group']
    comments = json.loads(params.content)
    if not self.isvalid(comments):
        Logger.getlogging().warning('{url}:30000 No comments'.format(url=params.originalurl))
        return
    # Video pages: take the publish date from the api payload.
    if self.r.search('http[s]{0,1}://.*video\.sina\.com.*', params.originalurl):
        publishdate = comments['result']['news']['time']
        NewsStorage.setpublishdate(params.originalurl, TimeUtility.getuniformtime(publishdate))
    # News pages: take the click count, but only when none is stored yet.
    elif self.r.search('http[s]{0,1}://.*\.sina\.com.*', params.originalurl):
        if NewsStorage.getclicknum(params.originalurl) <= 0:
            try:
                news_clicknum = comments['result']['count']['total']
                NewsStorage.setclicknum(params.originalurl, news_clicknum)
            except:
                Logger.printexception()
    comments_count = int(comments['result']['count']['show'])
    # Record the total comment count.
    NewsStorage.setcmtnum(params.originalurl, comments_count)
    cmtnum = CMTStorage.getcount(params.originalurl, True)
    if cmtnum >= comments_count:
        Logger.getlogging().warning('{url}:30000 No comments'.format(url=params.originalurl))
        return
    # Remaining pages, capped at self.maxpages.
    pages = int(math.ceil(float(comments_count - cmtnum) / self.DEFAULT_PAGE_SIE))
    if pages >= self.maxpages:
        pages = self.maxpages
    for page in range(1, pages + 1, 1):
        if page == 1:
            # Page 1 is the payload we already hold; parse it in place.
            self.step3(params)
            continue
        url = CommonComments.SINA_COMMENTS_URL.format(channel=channel, newsid=newsid, pn=page, ps=SinaComments.DEFAULT_PAGE_SIE)
        if group:
            url = url + '&group=' + group
        self.storeurl(url, params.originalurl, SinaComments.STEP_COMMENT_NEXT_PAGE)
def step1(self, params):
    """Extract publish time and the qitan/tv ids from an iqiyi page, then
    queue the play-count request and the first comment-count request.
    """
    # BUGFIX/cleanup: removed the unused 'pubtime' variable and the unused
    # (and costly) BeautifulSoup parse of the whole page — nothing below
    # used either.
    pubTimes = self.r.getid('data-qitancomment-tvyear', params.content)
    if pubTimes:
        NewsStorage.setpublishdate(params.url, TimeUtility.getuniformtime(pubTimes))
    # 1. Pull the data-qitancomment-qitanid / -tvid fields from the html.
    qitanid = self.r.getid('data-qitancomment-qitanid', params.content)
    tvid = self.r.getid('data-qitancomment-tvid', params.content)
    playcounturl = self.PLAYCOUNT_URL.format(tvid=tvid)
    self.storeurl(playcounturl, params.originalurl,
                  IqiyiComments.STEP_PLAYCOUNT, {'tvid': tvid})
    # 2. Branch on the qitanid value.
    if qitanid and int(qitanid):
        # 2.1 qitanid is non-zero: query comments by qitanid + tvid.
        comments_url = 'http://api.t.iqiyi.com/qx_api/comment/get_video_comments?page=1&page_size=1&qitanid={qitanid_value}&sort=add_time&need_total=1&tvid={tvid_value}'.format(
            qitanid_value=qitanid, tvid_value=tvid)
        self.storeurl(comments_url, params.originalurl, IqiyiComments.STEP_2, {
            'qitanid': qitanid,
            'tvid': tvid
        })
    else:
        # 2.2 qitanid is 0/absent: query comments by tvid only.
        comments_url = 'http://api.t.iqiyi.com/qx_api/comment/get_video_comments?page=1&page_size=1&sort=add_time&tvid={tvid_value}&need_total=1'.format(
            tvid_value=tvid)
        self.storeurl(comments_url, params.originalurl, IqiyiComments.STEP_2, {
            'qitanid': '0',
            'tvid': tvid
        })
def step1(self, params):
    """Parse a Discuz-style forum thread page: store the main post body and
    time, the comment count, then compute the page range of replies still
    needed and queue the last page for STEP_1_2.
    """
    #print params.content
    try:
        website = re.findall('http://(.*?)/',params.originalurl)[0]
        # Static-url threads (e.g. thread-<area>-<tid>-<page>.html).
        if re.search('^http://[(bbs)|(gz)|(moba)|(gxdmw)].*/\w+-\d+-\d+-\d.*',params.originalurl):
            area = re.findall('com/(\w+?)-',params.originalurl)[0]
            url_id = re.findall('\d+',params.originalurl)[-3]
        # forum.php?mod=...&tid=... style threads.
        elif re.search('^http://[(www)|(xsbbs)|(bbs)|(moba)].*/forum\.php\?mod=\w+(&fid=\d+)?&tid=\d+',params.originalurl):
            area = re.findall('mod=(\w+?)&',params.originalurl)[0]
            url_id = re.findall('tid=(\d+)',params.originalurl)[0]
        else:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
            return
        soup = BeautifulSoup(params.content,'html5lib')
        # Main post: body, time, view count, reply count, page count.
        main_content = soup.find(attrs={'id':re.compile(self.commentCsskey['content_idkey'])})
        if main_content:
            main_content = main_content.get_text()
            NewsStorage.setbody(params.originalurl, main_content)
        else:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
            return
        curtimeobj = soup.find(attrs={'id':re.compile(self.commentCsskey['time_idkey'])})
        if curtimeobj:
            # Some skins keep the time in a span's title attribute.
            if curtimeobj.select_one('span'):
                curtime = curtimeobj.select_one('span').get('title')
            else:
                curtime = curtimeobj.get_text()
            NewsStorage.setpublishdate(params.originalurl, TimeUtility.getuniformtime(curtime))
        if re.search('^http://www\.gxdmw\.com/.*',params.originalurl):
            # Special case for the http://www.gxdmw.com/ site layout.
            cmtnum = soup.find(attrs={'class':"vwthdreplies y"})
            curcmtnum = cmtnum.select_one('strong').get_text()
        else:
            cmtnum = soup.select(self.cmt_page_numCSS['cmtnumCss'])
            cmt_read = cmtnum[0].get_text()
            curcmtnum = cmtnum[1].get_text()
        curcmtnum = int(curcmtnum)
        NewsStorage.setcmtnum(params.originalurl, curcmtnum)
        dbcmtnum = CMTStorage.getcount(params.originalurl, True)
        # Incremental crawl: nothing new to fetch.
        if dbcmtnum >= curcmtnum:
            return
        # Page count: try each CSS variant in turn.
        pageObj = soup.select(self.cmt_page_numCSS4['pageCss'])
        if pageObj:
            Logger.getlogging().debug('first get pageObj:%s'% pageObj[-2].get_text().strip('.'))
        if not pageObj:
            pageObj = soup.select(self.cmt_page_numCSS['pageCss'])
        if not pageObj:
            pageObj = soup.select(self.cmt_page_numCSS2['pageCss'])
        if not pageObj:
            pageObj = soup.select(self.cmt_page_numCSS3['pageCss'])
        if pageObj:
            page_num = pageObj[-2].get_text().strip('.')
        else:
            page_num = 1
        # Per-site page-size overrides.
        if re.search('^http://bbs\.(17173|17k|gamersky)\.com/.*', params.originalurl):
            page_size = self.page_size2
        elif re.search('^http://bbs\.78dm\.net/.*', params.originalurl):
            page_size = self.page_size3
        else:
            page_size = self.page_size
        start = int(dbcmtnum / page_size) + 1
        end = int(page_num)
        if end > start+ self.maxpages:
            start = end - self.maxpages
        params.customized['page'] = 1
        if end == 1:
            self.step2(params)
            return
        if start == 1:
            self.step2(params)
        # Fetch the last page; STEP_1_2 walks backwards from there.
        if re.search('^http://[(bbs)|(gz)|(moba)|(gxdmw)].*/\w+-\d+-\d+-\d.*',params.originalurl):
            url = self.COMMENTS_URL.format(website=website,area=area,url_id=url_id,page=end)
        if re.search('^http://[(www)|(xsbbs)|(bbs)|(moba)].*/forum\.php\?mod=\w+(&fid=\d+)?&tid=\d+',params.originalurl):
            url = self.FORUM_URL.format(website=website,area=area,url_id=url_id,page=end)
        if url:
            self.storeurl(url, params.originalurl, self.STEP_1_2,{'page':end, 'start':start, 'end':end, 'website':website, 'area':area, 'url_id':url_id})
    except:
        Logger.printexception()
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
def getpagecomments_step2(self, params):
    """Parse one page of Tieba-style posts.

    On page 1 the first post is the thread body: its time becomes the
    publish date, and threads older than self.COMMENT_LIMIT_DAYS abort the
    crawl (returns False).  Every later post is stored as a comment when
    it was posted "yesterday".  Returns True on success, False on
    error/too-old.
    """
    def _post_time(node):
        # A post's time lives in its last '.tail-info' node; fall back to
        # the first 'Y-m-d H:M' substring in the raw html.  '' when absent.
        # (Deduplicates logic the original repeated for main post and replies.)
        infos = node.select('.tail-info')
        if infos:
            return getuniformtime(infos[-1].get_text().strip())
        found = re.findall('\d+-\d+-\d+ \d+:\d+', str(node))
        if found:
            return getuniformtime(found[0])
        return ''

    try:
        page = params.customized['page']
        soup = BeautifulSoup(params.content, "html5lib")
        posts = soup.select('#j_p_postlist > div.j_l_post')
        if page == 1:
            # First node on page 1 is the main post.
            pubtimes = _post_time(posts[0])
            if pubtimes:
                NewsStorage.setpublishdate(params.originalurl, pubtimes)
                if not compareNow(pubtimes, self.COMMENT_LIMIT_DAYS):
                    # Thread older than the limit: stop fetching replies.
                    Logger.log(params.originalurl,
                               constant.ERRORCODE_WARNNING_NOMATCHTIME)
                    return False
            posts = posts[1:]
        for item in posts:
            try:
                comment = item.find(attrs={'id': re.compile("post_content")})
                if not comment:
                    continue
                content = comment.get_text().strip()
                pubtimes = _post_time(item)
                if not pubtimes:
                    # No timestamp: store once under today's date.
                    if not CMTStorage.exist(params.originalurl, content,
                                            TimeUtility.getdatebefore(0), 'nick'):
                        CMTStorage.storecmt(params.originalurl, content,
                                            TimeUtility.getdatebefore(0), 'nick')
                    continue
                Logger.getlogging().debug(pubtimes)
                # Incremental crawl: only keep comments posted yesterday.
                if self.isyestoday(pubtimes):
                    if not CMTStorage.exist(params.originalurl, content, pubtimes, 'nick'):
                        CMTStorage.storecmt(params.originalurl, content, pubtimes, 'nick')
            except:
                Logger.printexception()
        return True
    except:
        Logger.printexception()
        return False
def process(self, params):
    """Two-step poocg.com comment crawler.

    STEP_1: read totals from the work page, queue comment pages, and
    backfill click count, fans count and publish date.
    STEP_3: extract comments, nicks and times from one comment page.
    """
    Logger.getlogging().info(params.url)
    try:
        if params.step is poocgNewsComments.STEP_1:
            # Step1: derive the work id from the url.
            articleId = self.r.parse('^http[s]?://www\.poocg\.com/works/view/(\d+)', params.originalurl)[0]
            # Total number of comments.
            comment_count = float(self.r.parse(u'<p><strong>(\\d+)</strong><span>评论</span></p>', params.content)[0])
            NewsStorage.setcmtnum(params.originalurl, int(comment_count))
            if comment_count == 0:
                return
            # Incremental: skip when we already hold at least that many.
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            if cmtnum >= comment_count:
                return
            # Number of comment pages still needed, capped at maxpages.
            page_num = int(math.ceil(float(comment_count - cmtnum) / poocgNewsComments.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            for page in range(1, page_num + 1, 1):
                url = poocgNewsComments.COMMENT_URL % (articleId, page)
                self.storeurl(url, params.originalurl, poocgNewsComments.STEP_3)
            if NewsStorage.getclicknum(params.originalurl) <= 0:
                clicknum = int(self.r.parse(u'<p><strong>(\\d+)</strong><span>浏览</span></p>', params.content)[0])
                # BUGFIX: was NewsStorage.setpublishdate — the view counter
                # is a click count, matching the getclicknum guard above.
                NewsStorage.setclicknum(params.originalurl, clicknum)
            if NewsStorage.getfansnum(params.originalurl) <= 0:
                fansnum = int(self.r.parse(u'<p><strong>(\\d+)</strong><span>喜欢</span></p>', params.content)[0])
                # BUGFIX: was NewsStorage.setpublishdate — the like counter
                # is a fans count, matching the getfansnum guard above.
                NewsStorage.setfansnum(params.originalurl, fansnum)
            publishdate = str(self.r.parse(u'<p.*class="signed">(.*?)</p>', params.content)[0])
            NewsStorage.setpublishdate(params.originalurl, TimeUtility.getuniformtime(publishdate))
        elif params.step == poocgNewsComments.STEP_3:
            # Step3: pull comments/nicks/times from one comment page.
            Logger.getlogging().info("params.step == 3")
            soup = BeautifulSoup(params.content, 'html.parser')
            comments = soup.select('.p2')
            nicks = soup.select('.name')
            times = soup.select('.contentbox .time')
            for index in range(0, int(len(comments)), 1):
                # Time and nick are best-effort; fall back to '' / 'nickname'.
                try:
                    if len(times) > 0:
                        publictime = times[index].get_text()
                        curtime = TimeUtility.getuniformtime(publictime)
                    else:
                        curtime = ''
                except:
                    curtime = ''
                content = comments[index].get_text()
                try:
                    nick = str(nicks[index].get_text())
                except:
                    nick = 'nickname'
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
        else:
            Logger.getlogging().error('proparam.step == {step}'.format(step=params.step))
    except Exception as e:
        traceback.print_exc()
def process(self, proparam):
    """Three-step bbs.rayli.com.cn gallery comment crawler.

    STEP_1 derives the gallery id from the url and queues the first
    comment page; STEP_2 reads the reply total and queues every needed
    page; STEP_3 extracts comments/nicks/times and updates the publish
    date.
    """
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step is rayliComments.STEP_1:
            # Gallery id from the url (gallery-<id>-<page>.html).
            articleId = re.findall(
                '^http://bbs\.rayli\.com\.cn/gallery-(\d+)-\d+.html',
                proparam.url).__getitem__(0)
            # First comment page url.
            comments_url = rayliComments.COMMENTS_URL % (articleId, 1)
            self.storeurl(comments_url, proparam.originalurl,
                          rayliComments.STEP_2, {
                              'articleId': articleId,
                          })
        elif proparam.step == rayliComments.STEP_2:
            articleId = proparam.customized['articleId']
            # Total reply count ('回复: N' in the page).
            comments_count = float(
                re.findall(ur'回复:</span> (\d+)</div>',
                           proparam.content).__getitem__(0))
            if int(comments_count) == 0:
                return
            # Incremental crawl: stop when nothing new.
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            if cmtnum >= comments_count:
                return
            NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            page_num = int(
                math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            # Queue every comment page url.
            for page in range(1, page_num + 1, 1):
                url = rayliComments.COMMENTS_URL % (articleId, page)
                self.storeurl(url, proparam.originalurl,
                              rayliComments.STEP_3)
        elif proparam.step == rayliComments.STEP_3:
            commentsInfo = []
            soup = BeautifulSoup(proparam.content, 'html.parser')
            # Comment bodies.
            comments = soup.select('.t_f')
            # Comment times.
            commentTime = self.r.parse(
                ur'<em id="authorposton\d+">发表于 (.+?)</em>',
                proparam.content)
            # Commenter nicks.
            nicks = soup.select('.xw1')
            # On page 1 the first '.t_f' node is the original post: skip it.
            page = int(
                self.r.parse(ur'page=1-page-(\d+)', proparam.url)[0])
            if page == 1:
                index = 1
            else:
                index = 0
            publishlist = [
                TimeUtility.getcurrentdate(TimeUtility.DEFAULTFORMAT)
            ]
            if len(comments) > 0:
                # Store each comment once.
                for index in range(index, len(comments), 1):
                    content = comments[index].text.strip()
                    curtime = commentTime[index]
                    nick = nicks[index].text
                    publishlist.append(curtime)
                    if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
            if len(publishlist) > 0:
                # Oldest seen time (capped at today) becomes the publish date.
                publishdate = min(publishlist)
                NewsStorage.setpublishdate(proparam.originalurl, publishdate)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception, e:
        traceback.print_exc()
def step1(self, params):
    """Build the comment-api first-page url for a sohu page and queue it;
    also queue a click-count api request when the stored click count is
    missing, and record the upload time for my.tv pages.
    """
    try:
        comment_source_url = ''
        if self.r.search('http[s]{0,1}://.*tv\.sohu\.com.*', params.originalurl):
            # For movies / tv series / sohu games the topic_source_id comes
            # from different page fields.
            if self.r.search('^http://tv\.sohu\.com/\d{8}/n\d+\.shtml', params.originalurl):
                topic_source_id = self.r.parse(
                    'var[\s]*vid[\s]*=[\s]*\"(.+?)\"', params.content)
                if topic_source_id:
                    topic_source_id = topic_source_id[0]
                else:
                    Logger.log(params.originalurl,
                               constant.ERRORCODE_SITE_NOGET_COMMNETS)
                    return
            elif self.r.search('^http://my\.tv\.sohu\.com/.*.shtml',
                               params.originalurl):
                # my.tv pages: last number in the url, prefixed 'bk'.
                topic_source_id = self.r.parse('\d{1,}',
                                               params.originalurl)[-1]
                topic_source_id = 'bk' + topic_source_id
            else:
                # Other tv pages: playlist id from the html, prefixed 'vp'.
                topic_source_id = self.r.getid('PLAYLIST_ID', params.content)
                if not topic_source_id:
                    topic_source_id = self.r.getid('playlistId', params.content)
                if not topic_source_id:
                    Logger.log(params.originalurl,
                               constant.ERRORCODE_SITE_NOGET_COMMNETS)
                    return
                topic_source_id = 'vp' + topic_source_id
            comment_source_url = self.TV_COMMENTS_SOURCE_URL.format(
                self.tv_client_id, params.originalurl, topic_source_id,
                self.tv_page_size)
        else:
            if self.r.parse('group', params.originalurl):
                topic_source_id = \
                    self.r.parse('http[s]{0,1}://.*\.sohu\.com/group-(\d+)\.shtml.*', params.originalurl)[0]
                comment_source_url = self.COMMENTS_SOURCE_URL.format(
                    self.client_id, self.group_mark + topic_source_id,
                    self.page_size)
            else:
                topic_source_id = \
                    self.r.parse('http[s]{0,1}://.*\.sohu\.com/\d{8}/n(\d+)\.shtml.*', params.originalurl)[0]
                comment_source_url = self.COMMENTS_SOURCE_URL.format(
                    self.client_id, topic_source_id, self.page_size)
        self.storeurl(comment_source_url, params.originalurl,
                      self.STEP_COMMENT_FIRST_PAGE)
        #http://tv.sohu.com/20170831/n600133376.shtml
        #http://tv.sohu.com/s2015/newslist/?vid=4016103  (currently not retrievable)
        # Click-count check: when xpath extraction got nothing, fetch the
        # count through the api instead.
        if NewsStorage.getclicknum(params.originalurl) <= 0:
            if re.search('^http://tv\.sohu\.com/\d{8}/n\d+\.shtml',
                         params.originalurl):
                vid = self.r.getid('vid', params.content, split='=')
                clickurl = self.TVCLICKURL.format(vid=vid)
                self.storeurl(clickurl, params.originalurl,
                              self.STEP_TVCLICK)
            elif re.search('^http://tv\.sohu\.com/.*vid=(\d+)',
                           params.originalurl):
                vid = self.r.parse('^http://tv\.sohu\.com/.*vid=(\d+)',
                                   params.originalurl)[0]
                clickurl = self.TVCLICKURL.format(vid=vid)
                self.storeurl(clickurl, params.originalurl,
                              self.STEP_TVCLICK)
            elif re.search('^http[s]{0,1}://my\.tv\.sohu\.com.*\.shtml$',
                           params.originalurl):
                clickurl = self.MYTVCLICKURL.format(
                    vid=params.originalurl.split('/')[-1].split('.')[0])
                self.storeurl(clickurl, params.originalurl,
                              self.STEP_MYTVCLICK)
        # my.tv pages expose an uploadTime field: use it as publish date.
        if re.search('^http[s]{0,1}://my\.tv\.sohu\.com.*\.shtml$',
                     params.originalurl):
            if not params.content:
                Logger.getlogging().debug("no params.content")
            if not self.r.search('uploadTime: \'(.*)?\'', params.content):
                Logger.getlogging().debug("no params.content uploadTime")
            if self.r.search('uploadTime: \'(.*)?\'', params.content):
                publishdate = self.r.parse('uploadTime: \'(.*)?\'',
                                           params.content)[0]
                NewsStorage.setpublishdate(
                    params.originalurl,
                    TimeUtility.getuniformtime(publishdate))
    except:
        Logger.printexception()
        Logger.getlogging().error(
            'extract comment error from {site}'.format(site=params.url))