def seturlinfos(params):
    """Insert or update the news row for params.url.

    Fields common to both paths (title, optional body, counters, update
    time) are collected once; the existing-row path additionally fills the
    publish date only when the stored one is still the zero placeholder,
    while the new-row path fills identity fields and always sets the
    publish date.

    Side effects: one update or one insert on SPIDER_TABLE_NEWS.
    """
    newsid = NewsStorage.getid(params.url)
    # Fields written on both the update path and the insert path.
    data = {}
    data[SQLDAO.SPIDER_TABLE_NEWS_TITLE] = Common.strfilter(params.title)
    if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
        # Video pages carry no body text.
        data[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(params.body)
    data[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM] = params.cmtnum
    data[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = params.clicknum
    data[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM] = params.fansnum
    data[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM] = params.votenum
    data[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
    if NewsStorage.exist(params.url):
        doc = NewsStorage.getdoc(params.url)
        # Only overwrite the publish date when the stored value is still the
        # zero placeholder; never clobber a real publish date.
        if doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
                   TimeUtility.getintformtime(0)) == TimeUtility.getintformtime(0):
            data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = \
                TimeUtility.getuniformtime(params.pubtime)
        SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                    {SQLDAO.SPIDER_TABLE_NEWS_ID: newsid},
                                    data)
    else:
        # New row: identity and provenance fields, publish date always set.
        data[SQLDAO.SPIDER_TABLE_NEWS_TYPE] = params.type
        data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = \
            TimeUtility.getuniformtime(params.pubtime)
        data[SQLDAO.SPIDER_TABLE_NEWS_ID] = newsid
        data[SQLDAO.SPIDER_TABLE_NEWS_URL] = params.url
        data[SQLDAO.SPIDER_TABLE_NEWS_QUERY] = params.query
        data[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL] = params.channel
        data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE] = params.createtime
        data[SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG] = NewsStorage.LOCALMACHINEFLAG
        SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                    SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                    SQLDAO.getvaluesfromkeys(data))
def updatedb(self):
    """Move expired news rows owned by this machine into the cold table.

    A row is considered expired when its publish date is older than
    ``self.period`` days, or — when the publish date is still the zero
    placeholder — when its create date is older than ``self.period`` days.
    Expired rows are deleted from SPIDER_TABLE_NEWS and bulk-inserted into
    SPIDER_TABLE_NEWS_COLD.
    """
    # Original note (translated): please do not delete this commented-out SQL.
    # It is the server-side equivalent of the Python-side age filter below.
    #wheref = '{key1}={val1} and \
    #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) > {secs}) or \
    #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) > {secs}))'
    #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
    #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
    #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
    #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
    #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
    #secs =self.period * 24*60*60
    #)
    # Fetch every row owned by this machine; the age filter runs in Python.
    where = {
        SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG
    }
    results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
    colddata = []
    for result in results:
        data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
        try:
            publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
            createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
            # Expired if: publish date unset (zero) and created more than
            # `period` days ago, OR publish date set and older than `period`
            # days.
            if (publishdate == TimeUtility.getintformtime(0) and
                    SQLDAO.gettime() - createdate > self.period * 24*60*60) or \
               (publishdate != TimeUtility.getintformtime(0) and
                    SQLDAO.gettime() - TimeUtility.getinttime(publishdate) > self.period * 24*60*60):
                id = data[SQLDAO.SPIDER_TABLE_NEWS_ID]
                colddata.append(result)
                SQLDAO.getinstance().delete(
                    SQLDAO.SPIDER_TABLE_NEWS,
                    {SQLDAO.SPIDER_TABLE_NEWS_ID: id})
        except:
            # Best-effort: log the failure and keep processing remaining rows.
            Logger.printexception()
            Logger.log(data[SQLDAO.SPIDER_TABLE_NEWS_URL],
                       constant.ERRORCODE_WARNNING_OTHERS)
    if colddata:
        # 'mutli' is the DAO's own keyword spelling (sic) for bulk insert.
        SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS_COLD,
                                    SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                    colddata, mutli=True)
def seturlinfo(url, key=None, value=None, data=None):
    """Set a single field (or a full field dict) on the news row for *url*.

    Three modes:
      * ``data`` given: blind update of the existing row with that dict.
      * row exists: update ``key`` to ``value`` (plus the update timestamp)
        only when the stored value differs.
      * row missing: insert a new skeleton row carrying ``key``/``value``.

    Publish-date values are normalized via TimeUtility.getuniformtime before
    comparison/storage.

    Fix: the default for ``data`` was a mutable ``{}``; replaced with ``None``
    (backward compatible — the default is only truth-tested and rebound,
    and both ``{}`` and ``None`` are falsy).
    """
    id = NewsStorage.getid(url)
    if data:
        SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                    {SQLDAO.SPIDER_TABLE_NEWS_ID: id},
                                    data)
        return
    if SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE == key:
        value = TimeUtility.getuniformtime(value)
    if NewsStorage.exist(url):
        doc = NewsStorage.getdoc(url)
        tempvalue = doc.get(key, '')
        # Skip the write entirely when the stored value is already current.
        if tempvalue != value:
            data = {
                key: value,
                SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime()
            }
            SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                        {SQLDAO.SPIDER_TABLE_NEWS_ID: id},
                                        data)
    else:
        # New row: identity/provenance fields plus the requested key/value.
        data = {}
        data[SQLDAO.SPIDER_TABLE_NEWS_ID] = id
        data[SQLDAO.SPIDER_TABLE_NEWS_URL] = url
        data[SQLDAO.SPIDER_TABLE_NEWS_QUERY] = SpiderConfigure.getinstance().getquery()
        data[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL] = SpiderConfigure.getinstance().getchannel()
        data[SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG] = NewsStorage.LOCALMACHINEFLAG
        data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE] = SpiderConfigure.getinstance().starttime()
        data[key] = value
        data[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
        SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                    SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                    SQLDAO.getvaluesfromkeys(data))
def getcount(url, before=False):
    """Return the number of comment rows stored for *url*.

    With ``before=True`` only comments whose create date is not later than
    the current time are counted (via a raw SQL where-clause); otherwise
    every comment row for the url is counted.
    """
    dao = SQLDAO.getinstance()
    if before:
        # NOTE(review): url is interpolated straight into the SQL text; if it
        # can ever contain a double quote this is injection-prone — confirm
        # whether the DAO offers a parameterized where-clause.
        clause = '{urlkey}=\"{url}\" and {datekey}<={date}'.format(
            urlkey=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
            url=url,
            datekey=SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE,
            date=SQLDAO.gettime())
        return dao.count(SQLDAO.SPIDER_TABLE_COMMENTS, where=clause)
    return dao.count(SQLDAO.SPIDER_TABLE_COMMENTS,
                     {SQLDAO.SPIDER_TABLE_COMMENTS_URL: url})
def storeurl(url):
    """Insert a skeleton news row for *url* unless one already exists.

    The row carries only identity/provenance fields (id, url, current query
    and channel, machine flag, create/update times); content fields are
    filled later by seturlinfo/seturlinfos.
    """
    newsid = NewsStorage.getid(url)
    if NewsStorage.exist(url):
        return
    conf = SpiderConfigure.getinstance()
    record = {
        SQLDAO.SPIDER_TABLE_NEWS_ID: newsid,
        SQLDAO.SPIDER_TABLE_NEWS_URL: url,
        SQLDAO.SPIDER_TABLE_NEWS_QUERY: conf.getquery(),
        SQLDAO.SPIDER_TABLE_NEWS_CHANNEL: conf.getchannel(),
        SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: NewsStorage.LOCALMACHINEFLAG,
        SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: conf.starttime(),
        SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime(),
    }
    SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                SQLDAO.getvaluesfromkeys(record))
def dumpurls(self):
    """Dump this machine's per-query url lists for uploading.

    Phase 1: for every local query, refresh storage info (via
    NewsStorage.seturlinfos) for urls whose publish date — or create date
    when the publish date is the zero placeholder — falls within the last
    ``self.period`` days.
    Phase 2: re-queue urls created at this run's start time into the URL
    manager for webkit fetching.
    """
    # Dump the url list for this machine's queries and store it into the
    # corresponding file.  (original comment translated)
    s2file = SpiderConfigure.getinstance().gets2file()
    s2temppath = Storage.getstoragelocation(
        const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
    #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
    querys = QueryStorage.getinstance().getlocalquerys(
        s2temppath, ETLController.LOCALMACHINEFLAG)
    for query in querys:
        Logger.getlogging().debug(
            'Now, Starting Select url to Insert and Update for uploading location urlfile!'
        )
        self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
        self.conf.setquery(query)
        # Original note (translated): please do not delete this commented-out SQL.
        # 1. Convert data within the period:
        #   1.1 publish date set: within the last week
        #   2.1 publish date zero: fall back to create time, within the last week
        #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
        #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
        #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
        #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
        #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
        #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #starttime = SpiderConfigure.getinstance().starttime(),
        #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
        #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
        #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
        #secs =self.period * 24*60*60
        #)
        where = {
            SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
            SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG
        }
        Logger.getlogging().debug(
            'Query condition: {where}'.format(where=str(where)))
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urltemplist = []
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
            createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            # In-period check: publish date unset -> compare create date;
            # publish date set -> compare publish date.
            if (publishdate == TimeUtility.getintformtime(0) and
                    SQLDAO.gettime() - createdate <= self.period * 24*60*60) or \
               (publishdate != TimeUtility.getintformtime(0) and
                    SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24*60*60):
                if url not in urltemplist:
                    urltemplist.append(url)
                    params = PageBasicInfo()
                    params.url = url
                    NewsStorage.seturlinfos(params)
        # 2. Extract rows whose create date equals this run's start time.
        # (original comment translated)
        URLFileManager.getinstance().generateurlfilepath()
        where = {
            SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
            SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG,
            SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: SpiderConfigure.getinstance().starttime()
        }
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urllist = []
        linecount = 0
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            urllist.append(url)
            context = URLContext()
            context.originalurl = url
            context.type = URLContext.S1_MAIN_BODY
            context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[
                SQLDAO.SPIDER_TABLE_NEWS_TYPE]
            Logger.getlogging().debug(url)
            URLManager.getinstance().storeurl(url, context, REQUEST_TYPE_WEBKIT)
            linecount += 1