def storecmt(url, content, pubdate, user):
    """Store one comment row unless an identical one already exists.

    content/user are filtered and pubdate normalized first, so the stored
    row matches the key used for the duplicate check.
    """
    content = Common.strfilter(content)
    user = Common.strfilter(user)
    pubdate = TimeUtility.getuniformtime(pubdate)
    # already stored: nothing to do
    if CMTStorage.exist(url, content, pubdate, user):
        return
    Logger.getlogging().debug(
        'url:{url}, content:{content}, pubdate:{pubdate}, user:{user}'.format(
            url=url, content=content, pubdate=pubdate, user=user))
    row = {
        SQLDAO.SPIDER_TABLE_COMMENTS_ID:
            CMTStorage.getid(url, content, pubdate, user),
        SQLDAO.SPIDER_TABLE_COMMENTS_URL: url,
        SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE: pubdate,
        SQLDAO.SPIDER_TABLE_COMMENTS_USER: user,
        SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT: content,
        SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE:
            SpiderConfigure.getinstance().starttime()
    }
    SQLDAO.getinstance().insert(
        SQLDAO.SPIDER_TABLE_COMMENTS,
        SQLDAO.SPIDER_TABLE_COMMENTS_KEYS,
        SQLDAO.getvaluesfromkeys(row, SQLDAO.SPIDER_TABLE_COMMENTS_KEYS))
def show():
    u'{channel}\t{query}\t{cmtnum}\t{clicknum}\t{fansnum}\t{votenum}\t{publishdate}\t{createdate}\t{url}'
    # Dump every news row created during this run to the debug log,
    # one tab-separated line per record (layout shown in the docstring).
    log = Logger.getlogging()
    log.debug('Now, Results Extract From Database Showing: ')
    log.debug(
        u'channel\tquery\tcmtnum\tclicknum\tfansnum\tvotenum\tpublishdate\tcreatedate\turl'
    )
    rows = SQLDAO.getinstance().find(
        SQLDAO.SPIDER_TABLE_NEWS,
        {SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE:
             SpiderConfigure.getinstance().starttime()})
    for row in rows:
        record = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, row)
        line = NewsStorage.NEWS_FORMAT.format(
            channel=record[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL],
            query=record[SQLDAO.SPIDER_TABLE_NEWS_QUERY],
            cmtnum=record[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM],
            clicknum=record[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM],
            fansnum=record[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM],
            votenum=record[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM],
            publishdate=record[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE],
            createdate=record[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE],
            url=record[SQLDAO.SPIDER_TABLE_NEWS_URL])
        log.debug(line)
def updatecommentsflag(self, urllist):
    """Mark the comments of the given urls as already pushed (key1=1).

    Bug fix: the previous code interpolated ``tuple(urllist)`` into the
    SQL; for a single url that renders as ``('x',)`` whose trailing comma
    is invalid SQL. The IN (...) clause is now built explicitly.
    """
    if not urllist:
        return
    urllist = [item.encode(constant.CHARSET_UTF8) for item in urllist]
    # NOTE(review): values are still string-interpolated (matches the rest
    # of this file); urls containing quotes would break the statement.
    inclause = '({0})'.format(','.join('"{0}"'.format(item)
                                       for item in urllist))
    sqlf = 'UPDATE {table} SET {key1}=1 WHERE {key1} is null and {url} in {urllist}'
    sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_COMMENTS,
                      key1=SQLDAO.SPIDER_TABLE_COMMENTS_KEY1,
                      url=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
                      urllist=inclause)
    Logger.getlogging().info('UPDATE comments SET key1=1 where key1 is null')
    SQLDAO.getinstance().execute(sql)
def updatenewsflag(self, idlist):
    """Mark the given news ids as already pushed (key1=1).

    Bug fix: the previous code interpolated ``tuple(idlist)`` into the
    SQL; for a single id that renders as ``('x',)`` whose trailing comma
    is invalid SQL. The IN (...) clause is now built explicitly.
    """
    if not idlist:
        return
    idlist = [item.encode(constant.CHARSET_UTF8) for item in idlist]
    inclause = '({0})'.format(','.join('"{0}"'.format(item)
                                       for item in idlist))
    sqlf = 'UPDATE {table} SET {key1}=1 WHERE {key1} is null and {id} in {idlist}'
    sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_NEWS,
                      key1=SQLDAO.SPIDER_TABLE_NEWS_KEY1,
                      id=SQLDAO.SPIDER_TABLE_NEWS_ID,
                      idlist=inclause)
    Logger.getlogging().info('UPDATE news SET key1=1 where key1 is null')
    SQLDAO.getinstance().execute(sql)
def getcount(url, before=False):
    """Count stored comments for *url*.

    With before=True, only comments whose create date is not later than
    now are counted (string WHERE clause instead of a dict condition).
    """
    table = SQLDAO.SPIDER_TABLE_COMMENTS
    if not before:
        return SQLDAO.getinstance().count(
            table, {SQLDAO.SPIDER_TABLE_COMMENTS_URL: url})
    where = '{urlkey}=\"{url}\" and {datekey}<={date}'.format(
        urlkey=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
        url=url,
        datekey=SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE,
        date=SQLDAO.gettime())
    return SQLDAO.getinstance().count(table, where=where)
def __init__(self):
    # Initialize the spider runtime: DB singleton, download platforms,
    # ETL pipeline and the timing parameters read from configuration.
    # download platform; make sure the DB singleton is constructed first
    SQLDAO.getinstance()
    self.downloader = Downloader()
    self.wdownloader = WDownloader()
    # ETL controller
    self.etl = ETLController()
    # polling interval (seconds) while waiting on the download platforms
    self.waitingperiod = int(
        SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                  const.SPIDER_WAITING_PERIOD))
    # overall wait timeout: twice the configured per-platform timeout
    self.timeout = int(2 * int(
        SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                  const.SPIDER_WAIT_PLATFORM_TIMEOUT)))
    # wall-clock start of this spider run (unix seconds)
    self.spiderstarttime = int(time.time())
    # hard cap for the external ("waibu") platform: 2 hours, in seconds
    self.waibutimeout = 2 * 60 * 60
def dereplicate(self):
    """Deduplicate across channels before output.

    For every not-yet-output (key1 is null) channel=201 url, find the
    not-yet-output channel=202 rows with the same url and flag their ids
    so those duplicates are excluded from the output.
    """
    sql = 'SELECT url from news where key1 is null and channel=201'
    sqlf = 'SELECT id from news where url=\"{url}\" and key1 is null and channel=202'
    duplicateids = []
    for row in SQLDAO.getinstance().execute(sql, find=True):
        matches = SQLDAO.getinstance().execute(sqlf.format(url=row[0]),
                                               find=True)
        if not matches:
            continue
        Logger.getlogging().info(
            'dereplicated url:\t{url}'.format(url=row[0]))
        duplicateids.extend(match[0] for match in matches)
    self.updatenewsflag(duplicateids)
def find(query, machineflaglist=MACHINEFLAGLIST, table=SQLDAO.SPIDER_TABLE_QUERYS):
    """Return the single query row assigned to any machine in machineflaglist."""
    where = '{querykey}=\"{query}\" and {machikey} in ({machine})'.format(
        querykey=SQLDAO.SPIDER_TABLE_QUERYS_QUERY,
        query=query,
        machikey=SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG,
        machine=','.join(machineflaglist))
    return SQLDAO.getinstance().find(table, where, multi=False)
def aggregate_curcmtnum(self):
    """Count not-yet-pushed comments (key1 is null) per url into url_curcmtnum_map."""
    sql = ('SELECT {url},count(*) from {table} where {key1} is null group by {url}'
           .format(table=SQLDAO.SPIDER_TABLE_COMMENTS,
                   url=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
                   key1=SQLDAO.SPIDER_TABLE_COMMENTS_KEY1))
    for url, num in SQLDAO.getinstance().execute(sql, find=True):
        urlmd5 = Common.md5(url.strip())
        # first value wins, matching the original "if not in" guard
        self.url_curcmtnum_map.setdefault(urlmd5, int(num))
def storeurl(url):
    """Insert a skeleton news row for *url* unless one already exists."""
    if NewsStorage.exist(url):
        return
    conf = SpiderConfigure.getinstance()
    row = {
        SQLDAO.SPIDER_TABLE_NEWS_ID: NewsStorage.getid(url),
        SQLDAO.SPIDER_TABLE_NEWS_URL: url,
        SQLDAO.SPIDER_TABLE_NEWS_QUERY: conf.getquery(),
        SQLDAO.SPIDER_TABLE_NEWS_CHANNEL: conf.getchannel(),
        SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: NewsStorage.LOCALMACHINEFLAG,
        SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: conf.starttime(),
        SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime(),
    }
    SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                SQLDAO.getvaluesfromkeys(row))
def exist(url, content, pubdate, user):
    """Return True when this exact comment is already stored.

    Checks the in-memory id cache first; on a DB hit the id is cached so
    later lookups for the same comment skip the database.
    """
    content = Common.strfilter(content)
    user = Common.strfilter(user)
    pubdate = TimeUtility.getuniformtime(pubdate)
    cid = CMTStorage.getid(url, content, pubdate, user)
    # fast path: seen during this process already
    if cid in CMTStorage.__cidset:
        return True
    # slow path: probe the comments table, then cache a hit
    if SQLDAO.getinstance().exists(
            SQLDAO.SPIDER_TABLE_COMMENTS,
            {SQLDAO.SPIDER_TABLE_COMMENTS_ID: cid}):
        CMTStorage.__cidset.add(cid)
        return True
    return False
def aggregate_beforenewsnum(self):
    """Count how many times each url was already pushed (key1=1) into url_beforenewsnum_map."""
    sql = ('SELECT {url},count(*) from {table} where {key1}=1 group by {url}'
           .format(table=SQLDAO.SPIDER_TABLE_NEWS,
                   url=SQLDAO.SPIDER_TABLE_NEWS_URL,
                   key1=SQLDAO.SPIDER_TABLE_NEWS_KEY1))
    for url, num in SQLDAO.getinstance().execute(sql, find=True):
        urlmd5 = Common.md5(url.strip())
        if urlmd5 not in self.url_beforenewsnum_map:
            self.url_beforenewsnum_map[urlmd5] = int(num)
def seturlinfos(params):
    """Insert or update the news row for params.url from a PageBasicInfo.

    The field assignments shared by both branches (title, body, counters,
    update date) were duplicated in the original; they are now built once.
    Behavior differences between branches are preserved:
    - update: TYPE is not written; PUBLISH_DATE only when the stored value
      is still the epoch-0 sentinel.
    - insert: TYPE, PUBLISH_DATE and the identity/metadata fields are set.
    """
    id = NewsStorage.getid(params.url)
    # fields common to insert and update
    data = {}
    data[SQLDAO.SPIDER_TABLE_NEWS_TITLE] = Common.strfilter(params.title)
    if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
        data[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(params.body)
    data[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM] = params.cmtnum
    data[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = params.clicknum
    data[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM] = params.fansnum
    data[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM] = params.votenum
    data[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
    if NewsStorage.exist(params.url):
        doc = NewsStorage.getdoc(params.url)
        # only set the publish date when it is still unset (epoch-0)
        if doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
                   TimeUtility.getintformtime(0)) == TimeUtility.getintformtime(0):
            data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = \
                TimeUtility.getuniformtime(params.pubtime)
        SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                    {SQLDAO.SPIDER_TABLE_NEWS_ID: id}, data)
    else:
        data[SQLDAO.SPIDER_TABLE_NEWS_TYPE] = params.type
        data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = \
            TimeUtility.getuniformtime(params.pubtime)
        data[SQLDAO.SPIDER_TABLE_NEWS_ID] = id
        data[SQLDAO.SPIDER_TABLE_NEWS_URL] = params.url
        data[SQLDAO.SPIDER_TABLE_NEWS_QUERY] = params.query
        data[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL] = params.channel
        data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE] = params.createtime
        data[SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG] = NewsStorage.LOCALMACHINEFLAG
        SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                    SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                    SQLDAO.getvaluesfromkeys(data))
def dumplocalquerys(queryfile=LOCALQUERYPATH, localmachine=LOCALMACHINEFLAG):
    """Write this machine's valid querys to queryfile (one per line) and return them."""
    rows = SQLDAO.getinstance().find(
        SQLDAO.SPIDER_TABLE_QUERYS,
        {SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: localmachine,
         SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1},
        keys=[SQLDAO.SPIDER_TABLE_QUERYS_QUERY])
    querys = [''.join(row) for row in rows]
    with open(queryfile, 'w+') as out:
        out.write('\n'.join(querys))
    return querys
def aggregate_curcomments(self):
    """Group not-yet-pushed comments (key1 is null) per url md5 as "content_publish" strings."""
    sql = ('SELECT {url},{content},{publish} from {table} where {key1} is null'
           .format(table=SQLDAO.SPIDER_TABLE_COMMENTS,
                   url=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
                   content=SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT,
                   publish=SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE,
                   key1=SQLDAO.SPIDER_TABLE_COMMENTS_KEY1))
    for url, rawcontent, rawpublish in SQLDAO.getinstance().execute(sql, find=True):
        urlmd5 = Common.md5(url)
        content = self.strfilter(rawcontent)
        publish = TimeUtility.getinttime(rawpublish)
        bucket = self.url_curcmtcontent_map.setdefault(urlmd5, [])
        bucket.append(content + '_' + str(int(publish)))
def seturlinfo(url, key=None, value=None, data=None):
    """Set one field (key/value) or many fields (data) on the news row for url.

    If *data* is truthy it is applied directly as an update. Otherwise the
    single key/value pair is written: existing rows are updated only when
    the stored value differs; missing rows are inserted together with the
    query/channel/machine metadata. Publish dates are normalized first.

    Fix: *data* defaulted to a shared mutable ``{}``; it is now ``None``,
    which is backward compatible because only its truthiness is tested.
    """
    id = NewsStorage.getid(url)
    if data:
        SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                    {SQLDAO.SPIDER_TABLE_NEWS_ID: id}, data)
        return
    if SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE == key:
        value = TimeUtility.getuniformtime(value)
    if NewsStorage.exist(url):
        doc = NewsStorage.getdoc(url)
        # skip the write when the stored value is already up to date
        if doc.get(key, '') != value:
            update = {
                key: value,
                SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime()
            }
            SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                        {SQLDAO.SPIDER_TABLE_NEWS_ID: id},
                                        update)
    else:
        conf = SpiderConfigure.getinstance()
        newrow = {
            SQLDAO.SPIDER_TABLE_NEWS_ID: id,
            SQLDAO.SPIDER_TABLE_NEWS_URL: url,
            SQLDAO.SPIDER_TABLE_NEWS_QUERY: conf.getquery(),
            SQLDAO.SPIDER_TABLE_NEWS_CHANNEL: conf.getchannel(),
            SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: NewsStorage.LOCALMACHINEFLAG,
            SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: conf.starttime(),
        }
        newrow[key] = value
        # set last so it wins even when key == UPDATE_DATE (as before)
        newrow[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
        SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                    SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                    SQLDAO.getvaluesfromkeys(newrow))
def dumplocalquerys_tieba(queryfile=LOCALQUERYPATH, localmachine=LOCALMACHINEFLAG):
    """Write this machine's valid tieba querys ("query<TAB>url" per line) to queryfile."""
    rows = SQLDAO.getinstance().find(
        SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
        {SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: localmachine,
         SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1},
        keys=[SQLDAO.SPIDER_TABLE_QUERYS_QUERY,
              SQLDAO.SPIDER_TABLE_QUERYS_QUERYURL])
    querys = ['\t'.join(row) for row in rows]
    with open(queryfile, 'w+') as out:
        out.write('\n'.join(querys))
    return querys
def storetiebaquery(self, query, queryurl, machineflaglist=MACHINEFLAGLIST_TIEBA):
    # If the query already exists, refresh its update time and re-validate
    # it; otherwise assign it to the machine currently holding the fewest
    # queries and insert a new row.
    query = query.strip()
    queryurl = queryurl.strip()
    result = QueryStorage.find(query, machineflaglist,
                               table=SQLDAO.SPIDER_TABLE_QUERYS_TIEBA)
    if result:
        resultdict = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_QUERYS_KEYS, result)
        machine = resultdict[SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG]
        id = QueryStorage.getid(query, machine)
        SQLDAO.getinstance().update(
            SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
            {SQLDAO.SPIDER_TABLE_QUERYS_ID: id},
            {
                SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
                SpiderConfigure.getinstance().starttime(),
                SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1
            })
    else:
        # pick the machine with the fewest stored tieba queries so far
        machine = min(self.querystorage_tieba.iteritems(),
                      key=lambda x: x[1])[0]
        data = {
            SQLDAO.SPIDER_TABLE_QUERYS_ID:
            QueryStorage.getid(query, machine),
            SQLDAO.SPIDER_TABLE_QUERYS_QUERY: query,
            SQLDAO.SPIDER_TABLE_QUERYS_CREATEDATE:
            SpiderConfigure.getinstance().starttime(),
            SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
            SpiderConfigure.getinstance().starttime(),
            SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: machine,
            SQLDAO.SPIDER_TABLE_QUERYS_QUERYURL: queryurl,
            SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1
        }
        SQLDAO.getinstance().insert(
            SQLDAO.SPIDER_TABLE_QUERYS_TIEBA, SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
            SQLDAO.getvaluesfromkeys(data, SQLDAO.SPIDER_TABLE_QUERYS_KEYS))
    # keep the live per-machine storage count up to date
    self.querystorage_tieba[machine] = self.querystorage_tieba.get(
        machine, 0) + 1
def writetofile(filename, cond=None):
    """Append all comments matching *cond* to *filename* in COMMENTS_FORMAT.

    Fix: *cond* defaulted to a shared mutable ``{}``; it is now ``None``
    and replaced by a fresh empty dict per call (an empty condition still
    selects every stored comment, as before).
    """
    if cond is None:
        cond = {}
    Logger.getlogging().debug(
        'Now {t}, Starting Output Comments To {f}'.format(t=int(time.time()),
                                                          f=filename))
    for doc in SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_COMMENTS, cond):
        url = doc[SQLDAO.SPIDER_TABLE_COMMENTS_URL]
        fstring = CMTStorage.COMMENTS_FORMAT.format(
            channel=ChannelDao.getchannel(url),
            content=doc[SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT],
            cmtnum=CMTStorage.getcount(url),
            publishdate=doc[SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE],
            user=doc[SQLDAO.SPIDER_TABLE_COMMENTS_USER],
            url=doc[SQLDAO.SPIDER_TABLE_COMMENTS_URL],
            title=NewsStorage.gettitle(url))
        FileUtility.writeline(filename, fstring.encode(CHARSET_UTF8))
    FileUtility.flush()
    Logger.getlogging().debug(
        '{t} Comments Finish'.format(t=int(time.time())))
def updatedb(self):
    """Move expired news rows for this machine to the cold table.

    A row is expired when its publish date (or create date when the
    publish date is still the epoch-0 sentinel) is more than self.period
    days in the past. Expired rows are deleted from the news table and
    bulk-inserted into the cold table.

    Fix: the bare ``except:`` now catches ``Exception`` only, so
    SystemExit/KeyboardInterrupt are no longer swallowed.
    """
    # NOTE: do not delete the commented-out SQL below (kept for reference)
    #wheref = '{key1}={val1} and \
    #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) > {secs}) or \
    #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) > {secs}))'
    #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
    #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
    #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
    #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
    #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
    #secs =self.period * 24*60*60
    #)
    where = {
        SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG
    }
    results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
    colddata = []
    expireseconds = self.period * 24 * 60 * 60  # hoisted loop invariant
    for result in results:
        data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
        try:
            publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
            createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
            if (publishdate == TimeUtility.getintformtime(0) and
                    SQLDAO.gettime() - createdate > expireseconds) or \
               (publishdate != TimeUtility.getintformtime(0) and
                    SQLDAO.gettime() - TimeUtility.getinttime(publishdate) > expireseconds):
                id = data[SQLDAO.SPIDER_TABLE_NEWS_ID]
                colddata.append(result)
                SQLDAO.getinstance().delete(
                    SQLDAO.SPIDER_TABLE_NEWS,
                    {SQLDAO.SPIDER_TABLE_NEWS_ID: id})
        except Exception:
            Logger.printexception()
            Logger.log(data[SQLDAO.SPIDER_TABLE_NEWS_URL],
                       constant.ERRORCODE_WARNNING_OTHERS)
    if colddata:
        # 'mutli' matches SQLDAO.insert's parameter spelling -- TODO confirm
        SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS_COLD,
                                    SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                    colddata, mutli=True)
def getlastpublish(url, before=True):
    """Return the latest comment publish date for *url*, or epoch-0 if none.

    With before=True only comments created before this run's start time
    are considered; otherwise all comments for the url count.
    """
    if before:
        where = '{urlkey}=\"{url}\" and {datekey} < {date}'.format(
            urlkey=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
            url=url,
            datekey=SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE,
            date=SpiderConfigure.getinstance().starttime())
    else:
        where = '{urlkey}=\"{url}\"'.format(
            urlkey=SQLDAO.SPIDER_TABLE_COMMENTS_URL, url=url)
    sql = 'SELECT MAX({key}) FROM {table} WHERE {where}'.format(
        key=SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE,
        table=SQLDAO.SPIDER_TABLE_COMMENTS,
        where=where)
    rows = SQLDAO.getinstance().execute(sql, find=True)
    latest = rows[0][0]
    return latest if latest else TimeUtility.getintformtime(0)
def updatedb():
    """Invalidate (valid=0) every currently-valid query assignment.

    Covers both query tables for the regular machines, and the regular
    query table for the external ("waibu") machines.
    """
    dao = SQLDAO.getinstance()
    invalidate = {SQLDAO.SPIDER_TABLE_QUERYS_VALID: 0}
    for machine in QueryStorage.MACHINEFLAGLIST:
        dao.update(SQLDAO.SPIDER_TABLE_QUERYS,
                   where={SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: machine,
                          SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1},
                   update=invalidate)
        dao.update(SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
                   where={SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: machine,
                          SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1},
                   update=invalidate)
    for machine in QueryStorage.MACHINEFLAGLIST_WAIBU:
        dao.update(SQLDAO.SPIDER_TABLE_QUERYS,
                   where={SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: machine,
                          SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1},
                   update=invalidate)
def fileformat(self):
    # Aggregate historical and current statistics, deduplicate, then
    # format every not-yet-output news row into the output file.
    self.aggregate_beforenewsinfo()
    self.aggregate_beforenewsnum()
    self.aggregate_curcomments()
    self.aggregate_curcmtnum()
    self.aggregate_beforecmtsnum()
    self.dereplicate()
    urllist = []
    idlist = []
    newscond = '{key} is null'.format(key=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
    results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                        where=newscond)
    for result in results:
        doc = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
        id = doc[SQLDAO.SPIDER_TABLE_NEWS_ID]
        url = doc[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
        try:
            urlmd5 = Common.md5(url)
            channel = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CHANNEL, '201')
            title = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TITLE, '')
            body = doc.get(SQLDAO.SPIDER_TABLE_NEWS_BODY, '')
            commentlist = self.url_curcmtcontent_map.get(urlmd5, [])
            comments = ' '.join(commentlist)
            pubtime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
                              TimeUtility.getintformtime(0))
            crawlertime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
                                  TimeUtility.getintformtime(0))
            type = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TYPE, '')
            query = doc.get(SQLDAO.SPIDER_TABLE_NEWS_QUERY, '')
            # Incremental comment-count push:
            # - first push (no comments flagged key1=1 yet): push the full
            #   count; prefer news.cmtnum when set, otherwise the count
            #   aggregated from the comments table (url_curcmtnum_map)
            # - later pushes: push only the aggregated incremental count
            cmtkey1flag = self.url_beforecmtnum_map.get(urlmd5, -1)
            if cmtkey1flag <= 0:
                cmtnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CMTNUM, -1)
                if cmtnum < 0:
                    cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
            else:
                cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
            # other counters are pushed as increments vs. the last push
            clicknum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, -1)
            clicknum = self.increment(urlmd5,
                                      SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM,
                                      clicknum)
            votenum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, -1)
            votenum = self.increment(urlmd5,
                                     SQLDAO.SPIDER_TABLE_NEWS_VOTENUM,
                                     votenum)
            fansnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, -1)
            fansnum = self.increment(urlmd5,
                                     SQLDAO.SPIDER_TABLE_NEWS_FANSNUM,
                                     fansnum)
            string = FileFormat.DEFAULT_NEWS_FORMAT.format(channel=channel,
                                                           url=url,
                                                           title=self.strfilter(title),
                                                           body=self.strfilter(body),
                                                           comments=comments,
                                                           cmtnum=cmtnum,
                                                           clicknum=clicknum,
                                                           votenum=votenum,
                                                           fansnum=fansnum,
                                                           pubtime=TimeUtility.getinttime(pubtime),
                                                           crawlertime=crawlertime,
                                                           type=type,
                                                           query=self.strfilter(query))
            Logger.getlogging().info(u'{channel}\t{query}\t{url}'.format(
                channel=channel, query=query,
                url=url).encode(constant.CHARSET_UTF8))
            # rows without a title go to the error file, others to output
            if not title:
                FileUtility.writeline(self.errorinfopath,
                                      string.encode(constant.CHARSET_UTF8))
            else:
                FileUtility.writeline(self.outputpath,
                                      string.encode(constant.CHARSET_UTF8))
            if id not in idlist:
                idlist.append(id)
            if title and commentlist:
                if url not in urllist:
                    urllist.append(url)
        except:
            Logger.getlogging().error(str(result))
            Logger.printexception()
    # already extracted: flag key1=1 so these are not output again
    self.updatenewsflag(idlist)
    self.updatecommentsflag(urllist)
def dumpurls(self):
    # Dump the urllist for this machine's queries and store it into the
    # corresponding url file.
    s2file = SpiderConfigure.getinstance().gets2file()
    s2temppath = Storage.getstoragelocation(
        const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
    #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
    querys = QueryStorage.getinstance().getlocalquerys(
        s2temppath, ETLController.LOCALMACHINEFLAG)
    for query in querys:
        Logger.getlogging().debug(
            'Now, Starting Select url to Insert and Update for uploading location urlfile!'
        )
        self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
        self.conf.setquery(query)
        # NOTE: do not delete the commented-out SQL below (kept for reference)
        # 1. Re-process data within the retention period:
        #    1.1 publishdate present: within the last week
        #    2.1 publishdate is 0: fall back to create date, within the last week
        #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
        #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
        #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
        #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
        #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
        #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #starttime = SpiderConfigure.getinstance().starttime(),
        #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
        #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
        #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
        #secs =self.period * 24*60*60
        #)
        where = {
            SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
            SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
            ETLController.LOCALMACHINEFLAG
        }
        Logger.getlogging().debug(
            'Query condition: {where}'.format(where=str(where)))
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urltemplist = []
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
            createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            # keep urls whose publish (or, when unset, create) date falls
            # inside the retention period; refresh each url at most once
            if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate <= self.period * 24*60*60) or \
               (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24*60*60):
                if url not in urltemplist:
                    urltemplist.append(url)
                    params = PageBasicInfo()
                    params.url = url
                    NewsStorage.seturlinfos(params)
        # 2. Extract rows whose createdate equals this run's start time
        URLFileManager.getinstance().generateurlfilepath()
        where = {
            SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
            SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
            ETLController.LOCALMACHINEFLAG,
            SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE:
            SpiderConfigure.getinstance().starttime()
        }
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urllist = []
        linecount = 0
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            urllist.append(url)
            context = URLContext()
            context.originalurl = url
            context.type = URLContext.S1_MAIN_BODY
            context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[
                SQLDAO.SPIDER_TABLE_NEWS_TYPE]
            Logger.getlogging().debug(url)
            URLManager.getinstance().storeurl(url, context,
                                              REQUEST_TYPE_WEBKIT)
            linecount += 1
def aggregate_beforenewsinfo(self):
    # How the latest pushed value per url is extracted:
    # 1. keep only valid values (>0), keyed by their crawl (create) time
    # 2. then take the value whose crawl time is the largest
    sqlf = 'SELECT {url},{createtime},{cmtnum}, {clicknum},{votenum},{fansnum} from {table} where {key1}=1'
    sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_NEWS,
                      url=SQLDAO.SPIDER_TABLE_NEWS_URL,
                      createtime=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
                      cmtnum=SQLDAO.SPIDER_TABLE_NEWS_CMTNUM,
                      clicknum=SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM,
                      votenum=SQLDAO.SPIDER_TABLE_NEWS_VOTENUM,
                      fansnum=SQLDAO.SPIDER_TABLE_NEWS_FANSNUM,
                      key1=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
    results = SQLDAO.getinstance().execute(sql, find=True)
    cmtnumlist = {}
    clicknumlist = {}
    votenumlist = {}
    fansnumlist = {}
    for result in results:
        url = result[0].strip()
        urlmd5 = Common.md5(url)
        createtime = result[1]
        cmtnum = result[2]
        clicknum = result[3]
        votenum = result[4]
        fansnum = result[5]
        if urlmd5 not in cmtnumlist:
            cmtnumlist[urlmd5] = {}
        if urlmd5 not in clicknumlist:
            clicknumlist[urlmd5] = {}
        if urlmd5 not in votenumlist:
            votenumlist[urlmd5] = {}
        if urlmd5 not in fansnumlist:
            fansnumlist[urlmd5] = {}
        # store valid values (>0) keyed by their crawl time; for equal
        # crawl times the larger value wins
        if cmtnum > 0:
            if cmtnumlist[urlmd5].get(str(createtime), 0) <= cmtnum:
                cmtnumlist[urlmd5][str(createtime)] = cmtnum
        if clicknum > 0:
            if clicknumlist[urlmd5].get(str(createtime), 0) <= clicknum:
                clicknumlist[urlmd5][str(createtime)] = clicknum
        if votenum > 0:
            if votenumlist[urlmd5].get(str(createtime), 0) <= votenum:
                votenumlist[urlmd5][str(createtime)] = votenum
        if fansnum > 0:
            if fansnumlist[urlmd5].get(str(createtime), 0) <= fansnum:
                fansnumlist[urlmd5][str(createtime)] = fansnum
    # NOTE(review): keys are str(createtime), so max(value) compares
    # lexicographically -- correct only while timestamps have equal width;
    # verify createtime's format.
    for urlmd5, value in cmtnumlist.iteritems():
        if not value:
            continue
        self.url_beforenewsinfo_map[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM][urlmd5] = value[max(value)]
    for urlmd5, value in clicknumlist.iteritems():
        if not value:
            continue
        self.url_beforenewsinfo_map[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM][urlmd5] = value[max(value)]
    for urlmd5, value in votenumlist.iteritems():
        if not value:
            continue
        self.url_beforenewsinfo_map[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM][urlmd5] = value[max(value)]
    for urlmd5, value in fansnumlist.iteritems():
        if not value:
            continue
        self.url_beforenewsinfo_map[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM][urlmd5] = value[max(value)]
def getdoc(url):
    """Fetch the news row for *url* as a key->value dict."""
    row = SQLDAO.getinstance().find(
        SQLDAO.SPIDER_TABLE_NEWS,
        {SQLDAO.SPIDER_TABLE_NEWS_ID: NewsStorage.getid(url)},
        multi=False)
    return SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, row)
def getcount(url):
    """Count news rows matching url's id (effectively 0 or 1)."""
    return SQLDAO.getinstance().count(
        SQLDAO.SPIDER_TABLE_NEWS,
        {SQLDAO.SPIDER_TABLE_NEWS_ID: NewsStorage.getid(url)})
def exist_cold(url):
    """Return True when *url* already has a row in the cold news table."""
    matches = SQLDAO.getinstance().find(
        SQLDAO.SPIDER_TABLE_NEWS_COLD,
        {SQLDAO.SPIDER_TABLE_NEWS_URL: url})
    return bool(matches)