def setclick(self, params):
    """Store the counters carried by ``params.content`` for the original URL.

    ``params.content`` is decoded as a JSON list. Four of its entries are
    written through ``NewsStorage.seturlinfo`` as comment, click, vote and
    fans counts; any entry that is falsy is stored as 0. Every exception
    raised on the way is handed to ``Logger.printexception()`` and is not
    propagated to the caller.
    """
    try:
        counters = json.loads(params.content)
        # Layout noted by the original author:
        # [play count, comments, X, X, danmaku, favorites, votes, X]
        # Only positions 1, 0, -2 and -3 are read here.
        comments = counters[1] or 0
        clicks = counters[0] or 0
        votes = counters[-2] or 0
        fans = counters[-3] or 0
        fields = {
            SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: comments,
            SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: clicks,
            SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: votes,
            SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: fans,
        }
        NewsStorage.seturlinfo(params.originalurl, data=fields)
    except:
        # Best-effort by design: failures are reported, never raised.
        Logger.printexception()
def step1(self, params):
    """Extract article metadata from a 36kr article page and queue its comments.

    The page source in ``params.content`` embeds a JSON object after
    ``<script>var props={"detailArticle|post":``. That object is parsed,
    the fields that carry a value are stored with
    ``NewsStorage.seturlinfo`` and the first comment-page URL is then
    registered for ``Kr36Comments.STEP_2``.

    Fields whose value is empty or zero are left out of the stored dict
    rather than aborting the whole step: previously a single zero counter
    left a local name unbound, so nothing was stored and no comment URL
    was queued.

    Args:
        params: object exposing ``originalurl`` (the article URL, expected
            to match ``http://36kr.com/p/<digits>.html``) and ``content``
            (the page source).

    Every exception is handed to ``Logger.printexception()`` and is not
    re-raised.
    """
    try:
        Logger.getlogging().info("Kr36Comments.STEP_1")
        cid = self.r.parse(r'^http://36kr.com/p/(\d+)\.html', params.originalurl)[0]
        content = params.content
        page_content = content.split('<script>var props={"detailArticle|post":')[1].split(',"abTest|abtest":')[0]
        # NOTE(review): eval() runs on text derived from a downloaded page.
        # json.dumps() yields a quoted, escaped string literal, so this should
        # only ever evaluate a literal, but json.loads(page_content) looks like
        # the safer equivalent. Presumably the round trip exists to normalise
        # non-ASCII text on Python 2 - confirm and replace.
        dump_content = eval(json.dumps(page_content))
        json_content = json.loads(dump_content)

        info_title = json_content["title"]
        info_content = json_content["content"]
        info_pubtime = TimeUtility.getformattime(json_content["published_at"])
        counters = json_content["counters"]
        info_clicknum = counters["view_count"]
        info_cmtnum = counters["comment"]
        info_fansnum = counters["favorite"]
        info_votenum = counters["like"]

        # Strip HTML tags from the article body.
        info_content = re.sub(r'</?\w+[^>]*>', '', info_content)

        # Keep only the fields that actually carry a value.
        candidates = (
            ("title", info_title),
            ("clicknum", info_clicknum),
            ("votenum", info_votenum),
            ("fansnum", info_fansnum),
            ("publishdate", info_pubtime),
            ("body", info_content),
            ("cmtnum", info_cmtnum),
        )
        data = {key: value for key, value in candidates if value}
        NewsStorage.seturlinfo(params.originalurl, "", "", data)

        # Build the URL of the first comment page from the article id.
        commentinfo_url = Kr36Comments.COMMENT_URL.format(cid, self.page_size, 1)
        self.storeurl(commentinfo_url, params.originalurl, Kr36Comments.STEP_2, {'cid': cid})
    except:
        # Best-effort by design: failures are reported, never raised.
        Logger.printexception()
def getinfo(self, params):
    """Store title, counters and publish time taken from a JSON article payload.

    ``params.content`` is decoded as JSON and its ``'article'`` object
    supplies the read, praise and favorite counts, the publish time
    (passed through ``TimeUtility.getuniformtime``) and the title. The
    result is written with ``NewsStorage.seturlinfo`` for
    ``params.originalurl``. Every exception is handed to
    ``Logger.printexception()`` and is not propagated.
    """
    try:
        article = json.loads(params.content)['article']
        reads = article['readnum']
        praises = article['praisenum']
        favorites = article['favoritenum']
        published = TimeUtility.getuniformtime(article['publishtime'])
        headline = article['title']
        NewsStorage.seturlinfo(params.originalurl, '', '', {
            "title": headline,
            "clicknum": reads,
            "votenum": praises,
            "fansnum": favorites,
            "publishdate": published,
        })
    except:
        # Best-effort by design: failures are reported, never raised.
        Logger.printexception()
def wb_updatedb(self):
    """Write every entry held in ``URLManager.waibustorage`` to news storage.

    The configuration is switched to channel ``SPIDER_CHANNEL_S2``; then,
    for each query key in ``URLManager.waibustorage``, that query is
    selected on ``self.conf`` and each of its entries is stored with
    title, body and publish date. Entries whose ``type`` equals
    ``SPIDER_S2_WEBSITE_VIDEO`` additionally store their click count.
    """
    self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
    for query in URLManager.waibustorage:
        self.conf.setquery(query)
        for entry in URLManager.waibustorage[query]:
            is_video = entry.type == constant.SPIDER_S2_WEBSITE_VIDEO
            record = {
                SQLDAO.SPIDER_TABLE_NEWS_TITLE: entry.title,
                SQLDAO.SPIDER_TABLE_NEWS_BODY: entry.body,
                SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE: entry.pubtime,
            }
            if is_video:
                # Only video-site entries carry a click count.
                record[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = entry.clicknum
            NewsStorage.seturlinfo(entry.url, data=record)