def flush():
    """Write the end-of-run spider report and fire a notification if the
    overall success rate falls below the configured threshold.

    Side effects only: logs failed S1 urls, emits per-query/per-site S2
    counts, rewrites the report file, and may send a SpiderNotify alert.
    """
    # dump s1 download failed url
    SpiderConfigure.getinstance().setchannel(constant.SPIDER_CHANNEL_S1)
    SpiderConfigure.getinstance().setquery('')
    for url in SpiderReport.getinstance().s1urls:
        Logger.log(url, constant.ERRORCODE_FAIL_LOAD_DOWN)
    # dump none url got from website for query
    # querynositemap counts, per query, how many sites yielded no url.
    querynositemap = {}
    for query in SpiderReport.getinstance().querysitesmap.keys():
        querynositemap[query] = 0
        for site in SpiderReport.getinstance().querysitesmap[query]:
            SpiderReport.s2queryurl(query, site, None, True)
            querynositemap[query] += 1
    #
    # NOTE(review): this loop iterates the same querysitesmap keys as the
    # loop above, which populates querynositemap for every such key — so the
    # else branch below looks unreachable. Possibly a different key set was
    # intended here; confirm against SpiderReport before changing.
    for query in SpiderReport.getinstance().querysitesmap.keys():
        if query in querynositemap:
            SpiderReport.s2queryurl(query,
                                    SpiderReport.getinstance().s2sitenum,
                                    SpiderReport.getinstance().s2sitenum - querynositemap[query],
                                    True)
        else:
            SpiderReport.s2queryurl(query,
                                    SpiderReport.getinstance().s2sitenum,
                                    SpiderReport.getinstance().s2sitenum,
                                    True)
    #
    # report
    # Report path template comes from config; {date} is today's date.
    filename = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                         const.SPIDER_INFO_REPORT_FILE).format(
        date=TimeUtility.getcurrentdate())
    # Remove any stale report so the file holds only this run's data.
    FileUtility.remove(filename)
    # Header row for the fixed-column report layout.
    FileUtility.writeline(filename, SpiderReport.REPORT_FORMAT.format(
        ch='CHANNEL',
        query='QUERY',
        type='TYPE',
        v1='UPLOAD',
        v2='DOWNLOAD',
        v3='NO_TEMPLATE',
        v4='NO_SITE',
        v5='WITH_CMT',
        v6='FAILED'
    ))
    # One line per (key, type) entry in the main and S2 site report maps.
    for key in SpiderReport.getinstance().reportlist.keys():
        for type in SpiderReport.getinstance().reportlist[key].keys():
            r = SpiderReport.getinstance().reportlist[key][type]
            FileUtility.writeline(filename, r.tostring())
    for key in SpiderReport.getinstance().s2sitereportlist.keys():
        for type in SpiderReport.getinstance().s2sitereportlist[key].keys():
            r = SpiderReport.getinstance().s2sitereportlist[key][type]
            FileUtility.writeline(filename, r.tostring())
    # Totals footer (two summary formats), then force buffered lines to disk.
    FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring())
    FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring2())
    FileUtility.flush()
    # Alert when the run's success rate drops below the configured threshold.
    threshold = float(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                const.SPIDER_FAILED_THRESHOLD))
    rate = SpiderReport.getinstance().totalreport.getsuccess()
    if rate < threshold:
        Logger.getlogging().warning('success rate is lower than threshold')
        param = NotifyParam()
        param.code = NotifyParam.SPIDER_NOTIFY_OVER_FAILED
        param.message = 'success rate {rate} is lower than threshold {th}'.format(rate=Common.float2percent(rate),
                                                                                  th=Common.float2percent(
                                                                                      threshold))
        SpiderNotify.notify(param)
def writetofile(filename, cond=None):
    """Dump comment rows matching *cond* from the comments table to *filename*.

    Args:
        filename: output path handed to FileUtility.writeline.
        cond: optional filter dict passed to SQLDAO.find; None (the default)
            means no filter. The original signature used a mutable default
            ``cond={}``, which is the shared-mutable-default pitfall — the
            same dict object would be reused across calls. ``None`` with an
            in-body fallback is the backward-compatible fix.
    """
    if cond is None:
        cond = {}
    Logger.getlogging().debug(
        'Now {t}, Starting Output Comments To {f}'.format(t=int(
            time.time()), f=filename))
    for doc in SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_COMMENTS, cond):
        url = doc[SQLDAO.SPIDER_TABLE_COMMENTS_URL]
        # Each row is rendered through the project's fixed comment format;
        # channel/count/title are looked up per url from the storage layers.
        fstring = CMTStorage.COMMENTS_FORMAT.format(
            channel=ChannelDao.getchannel(url),
            content=doc[SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT],
            cmtnum=CMTStorage.getcount(url),
            publishdate=doc[SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE],
            user=doc[SQLDAO.SPIDER_TABLE_COMMENTS_USER],
            url=doc[SQLDAO.SPIDER_TABLE_COMMENTS_URL],
            title=NewsStorage.gettitle(url))
        FileUtility.writeline(filename, fstring.encode(CHARSET_UTF8))
    # Push any buffered lines to disk before logging completion.
    FileUtility.flush()
    Logger.getlogging().debug(
        '{t} Comments Finish'.format(t=int(time.time())))
def upload(self):
    """Flush buffered file output, then upload every file found in the
    temporary URL storage location through this instance's downloader.

    Returns whatever ``self.downloader.upload`` returns for the file list.
    """
    FileUtility.flush()
    temp_location = Storage.getstoragelocation(const.SPIDER_URLS_TEMP_PATH)
    pending_files = FileUtility.getfilelist(temp_location, [])
    return self.downloader.upload(pending_files)