def download(urlfilepath):
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    filename = os.path.basename(urlfilepath)
    writeTmpfile = os.path.join(donepath, filename + '.temp')
    writefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    # drop any leftovers from a previous run
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    # under the Windows debug flag, scan the url file for https urls,
    # which must be fetched through webkit
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        readlines = FileUtility.readlines(urlfilepath)
        for line in readlines:
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # create an empty file
    with open(writeTmpfile, 'a+') as filetemp:
        filetemp.write('')
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, writeTmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, writeTmpfile)
    else:
        downGet(urlfilepath, writeTmpfile)
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=writefile))
    FileUtility.remove(urlfilepath)
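# Usage sketch (hedged; the path below is hypothetical and the suffix values
# live in the project's constant module): download() dispatches on the url
# file's type -- WEBKIT_FILE_SUFFIX (or any https url under the Windows debug
# flag) goes to downWebkit, POST_FILE_SUFFIX to downPost, everything else to
# downGet -- then renames the temp output to '<name>.txt.<unix-ts>.done':
#     download('./data/temp/urls/batch_0001.txt')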
def readFile(urlpath, filename):
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN, const.SPIDER_POST_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    writeTmpfile = donepath + filename + '.tmp'
    now = str(time.time()).split('.')[0]
    writefile = donepath + filename + '.txt.' + now + '.done'
    # drop any leftovers from a previous run
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    Logger.getlogging().debug('post_done start:{f}'.format(f=writefile))
    with open(urlpath, 'r') as fp:
        lines = fp.readlines()
    os.mknod(writeTmpfile)
    # post each json line and append the response to the temp file
    for line in lines:
        jsonLine = json.loads(line)
        try:
            jsonStr = downPost(jsonLine)
            with open(writeTmpfile, 'a+') as filetemp:
                filetemp.write(jsonStr + '\n')
            Logger.getlogging().debug('{url}:Post request succeeded'.format(url=jsonLine['url']))
        except:
            Logger.getlogging().warning('{url}:Post request failed'.format(url=jsonLine['url']))
            Logger.printexception()
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
    Logger.getlogging().debug('post_done end:{f}'.format(f=writefile))
    FileUtility.remove(urlpath)
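# Input sketch (hedged): readFile() expects urlpath to contain one JSON object
# per line. Only the 'url' key is referenced here (in the success/failure
# logs); any other fields are whatever downPost() consumes. A hypothetical
# input line:
#     {"url": "http://example.com/api", "data": {"page": 1}}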
def flush():
    # dump the s1 urls that failed to download
    SpiderConfigure.getinstance().setchannel(constant.SPIDER_CHANNEL_S1)
    SpiderConfigure.getinstance().setquery('')
    for url in SpiderReport.getinstance().s1urls:
        Logger.log(url, constant.ERRORCODE_FAIL_LOAD_DOWN)
    # dump queries for which no url was got from the website
    querynositemap = {}
    for query in SpiderReport.getinstance().querysitesmap.keys():
        querynositemap[query] = 0
        for site in SpiderReport.getinstance().querysitesmap[query]:
            SpiderReport.s2queryurl(query, site, None, True)
            querynositemap[query] += 1
    for query in SpiderReport.getinstance().querysitesmap.keys():
        if query in querynositemap:
            SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                    SpiderReport.getinstance().s2sitenum - querynositemap[query], True)
        else:
            SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                    SpiderReport.getinstance().s2sitenum, True)
    # report
    filename = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                         const.SPIDER_INFO_REPORT_FILE).format(date=TimeUtility.getcurrentdate())
    FileUtility.remove(filename)
    FileUtility.writeline(filename, SpiderReport.REPORT_FORMAT.format(
        ch='CHANNEL', query='QUERY', type='TYPE',
        v1='UPLOAD', v2='DOWNLOAD', v3='NO_TEMPLATE',
        v4='NO_SITE', v5='WITH_CMT', v6='FAILED'))
    for key in SpiderReport.getinstance().reportlist.keys():
        for type in SpiderReport.getinstance().reportlist[key].keys():
            r = SpiderReport.getinstance().reportlist[key][type]
            FileUtility.writeline(filename, r.tostring())
    for key in SpiderReport.getinstance().s2sitereportlist.keys():
        for type in SpiderReport.getinstance().s2sitereportlist[key].keys():
            r = SpiderReport.getinstance().s2sitereportlist[key][type]
            FileUtility.writeline(filename, r.tostring())
    FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring())
    FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring2())
    FileUtility.flush()
    # notify when the overall success rate falls below the configured threshold
    threshold = float(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                const.SPIDER_FAILED_THRESHOLD))
    rate = SpiderReport.getinstance().totalreport.getsuccess()
    if rate < threshold:
        Logger.getlogging().warning('success rate is lower than threshold')
        param = NotifyParam()
        param.code = NotifyParam.SPIDER_NOTIFY_OVER_FAILED
        param.message = 'success rate {rate} is lower than threshold {th}'.format(
            rate=Common.float2percent(rate),
            th=Common.float2percent(threshold))
        SpiderNotify.notify(param)
def download(self):
    """
    Downloading from the platform happens in two steps, while Windows requests the data directly and only runs step2: download()
    step1: download data from the platform to the local ./data/platform
    step2: copy data from ./data/platform to ./data/temp/done, then store the parsed json data in ./data/temp/json
    """
    files = []
    Logger.getlogging().debug('Get Valid PUC File From ' + self.download_path)
    #srclist = self.getvalidfiles(self.download_path)
    srclist = FileUtility.getfilelist(self.download_path, [])[0:self.maxfilenum]
    for donefile in srclist:
        try:
            if donefile.endswith('done'):
                Logger.getlogging().info('MOVE {file} TO {path}'.format(file=donefile, path=self.done_file))
                FileUtility.move(donefile, self.done_file)
                binfile = os.path.join(self.done_file, FileUtility.getfilename(donefile))
                #FileUtility.copy(donefile, self.cache_path)
                #binfile = self.cache_path + FileUtility.getfilename(donefile)
                #if FileUtility.getfilesize(donefile) == FileUtility.getfilesize(binfile):
                    ## back up today's puc files
                    #Logger.getlogging().info('MOVE {file} TO {path}'.format(file=donefile, path=self.pucbacktoday))
                    #FileUtility.move(donefile, self.pucbacktoday)
                    #if FileUtility.exists(donefile):
                        #Logger.getlogging().error('MOVE {file} failed'.format(file=donefile))
                #else:
                    #Logger.getlogging().error('File not equal {file}'.format(file=donefile))
                jsonfile = self.bin2json(binfile)
                files.append(jsonfile)
                try:
                    self.s3puc_dumpurls(jsonfile)
                    time.sleep(0.5)
                    Logger.getlogging().debug('Remove {f}'.format(f=jsonfile))
                    FileUtility.remove(jsonfile)
                    donefile2 = os.path.join(self.done_file, FileUtility.getfilename(donefile))
                    Logger.getlogging().debug('Remove {f}'.format(f=donefile2))
                    FileUtility.remove(donefile2)
                except:
                    Logger.printexception()
                    Logger.getlogging().error('no json file generate from done file:{done}'.format(done=binfile))
                    os.mknod(jsonfile)
        except:
            Logger.printexception()
    return files
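# Usage sketch (hedged; 'PlatformDownloader' and 'process' are placeholders
# for whatever class owns this method and for the caller's own handling):
#     downloader = PlatformDownloader()
#     for jsonfile in downloader.download():
#         process(jsonfile)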
def __init__(self):
    self.reportlist = {}
    self.s2sitereportlist = {}
    self.s2urlfilepath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                   const.SPIDER_S2_QUERY_URLS_FILE).format(date=TimeUtility.getcurrentdate())
    FileUtility.remove(self.s2urlfilepath)
    self.totalreport = Report()
    self.totalreport.channel = 'SUM'
    self.s1urls = []
    self.querysitesmap = {}
    self.s2sitenum = 0
    self.s2urlsitemap = {}
def upload(self, upfiles):
    Logger.getlogging().debug('uploading ......')
    for file in upfiles:
        # skip and delete empty files instead of uploading them
        if self.emptyfile(file):
            Logger.getlogging().info('remove empty file: ' + file)
            FileUtility.remove(file)
            continue
        if not self.__upload__(file):
            Logger.log(FileUtility.getfilename(file), constant.ERRORCODE_FAIL_LOAD_UP)
            return False
        Logger.getlogging().info('remove uploaded file: ' + file)
        FileUtility.remove(file)
        time.sleep(1)
    return True
def findmax(self):
    filelist = FileUtility.getfilelist(self.pucbackpath, [])
    tf = {}
    for f in filelist:
        t = int(re.findall('(\d+)', f)[-1])
        tf[t] = f
    if not tf:
        return 0
    tm = max(tf.keys())
    for f in filelist:
        t = int(re.findall('(\d+)', f)[-1])
        if t < tm:
            Logger.getlogging().info('REMOVE {file}'.format(file=f))
            FileUtility.remove(f)
    return tm
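# Extraction sketch (hedged; the filename below is hypothetical): findmax()
# treats the last run of digits in each backup filename as its timestamp,
#     int(re.findall('(\d+)', 'puc.1509417600.bak')[-1])  # -> 1509417600
# and removes every file older than the maximum, keeping only the newest backup.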
def download(self):
    """
    Downloading from the platform happens in two steps, while Windows requests the data directly and only runs step2: download()
    step1: download data from the platform to the local ./data/platform
    step2: copy data from ./data/platform to ./data/temp/done, then store the parsed json data in ./data/temp/json
    """
    files = []
    if self.completed():
        return files
    Logger.getlogging().debug(self.download_path)
    srclist = FileUtility.getfilelist(self.download_path, [])
    for donefile in srclist:
        filename = FileUtility.getfilename(donefile)
        if donefile.endswith('done') and filename not in self.download_file_list:
            self.download_file_list.append(filename)
            self.download_time = time.time()
            for upfile in self.upload_file_list.keys():
                if filename.startswith(upfile):
                    FileUtility.copy(donefile, self.cache_path)
                    binfile = self.cache_path + FileUtility.getfilename(donefile)
                    # remove the source only after verifying the copy by size
                    if FileUtility.getfilesize(donefile) == FileUtility.getfilesize(binfile):
                        Logger.getlogging().info('Remove {file}'.format(file=donefile))
                        FileUtility.remove(donefile)
                        if FileUtility.exists(donefile):
                            Logger.getlogging().error('Remove {file} failed'.format(file=donefile))
                    else:
                        Logger.getlogging().error('File not equal {file}'.format(file=donefile))
                    jsonfile = self.bin2json(binfile)
                    files.append(jsonfile)
                    uploadtime = self.uploadfile_retranslist[upfile].start_time
                    # drop the upload record once its download is complete
                    if RegexUtility.match(TencentDownloader.DOWNLOAD_FORMAT1.format(file=upfile), filename):
                        self.upload_file_list.pop(upfile)
                        self.uploadfile_retranslist.pop(upfile)
                    elif RegexUtility.match(TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile), filename):
                        value = RegexUtility.parse(TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile), filename)[0]
                        if value[0] == value[1]:
                            self.upload_file_list.pop(upfile)
                            self.uploadfile_retranslist.pop(upfile)
                    if not FileUtility.exists(jsonfile):
                        Logger.getlogging().error('no json file generate from done file:{done}'.format(done=binfile))
                        os.mknod(jsonfile)
                    # update upload time
                    keys = self.sortkeys()
                    for fl in keys:
                        if self.uploadfile_retranslist[fl].start_time >= uploadtime:
                            self.uploadfile_retranslist[fl].start_time = time.time()
                            time.sleep(0.1)
                            break
    return files
def show(self):
    diffinfolist = {}
    predict = self.database.getall()
    instances = URLStorage.getinstances()
    Logger.getlogging().info(
        '##############################################################################################')
    Logger.getlogging().info('%8s|%8s|%8s|%8s|%8s|%8s|%8s|%20s|%16s' %
                             ('key', 'flag', 'cmtnum', 'clicknum', 'votenum',
                              'fansnum', 'realnum', 'pubtime', 'timestamp'))
    for ins in instances.keys():
        diffinfolist[ins] = DiffInfomation()
        if ins != constant.SPIDER_CHANNEL_S1:
            diffinfolist[ins].channel = constant.SPIDER_CHANNEL_S2
            diffinfolist[ins].query = ins
        for key in instances[ins].urlinfodict:
            if instances[ins].urlinfodict[key].realnum > 0:
                StatisticsManager.updategotcomments(1)
            elif instances[ins].urlinfodict[key].cmtnum > 0:
                StatisticsManager.updatefailgotcomment(1)
            if predict and key in predict:
                info = URLCommentInfo.fromstring(predict[key])
                if not instances[ins].urlinfodict[key].isequal(info):
                    self.printinfo(ins, info, '-')
                    self.printinfo(ins, instances[ins].urlinfodict[key], '+')
                    if instances[ins].urlinfodict[key].cmtnum > 0:
                        diffinfolist[ins].deltacmt += self.diff(
                            instances[ins].urlinfodict[key].cmtnum, info.cmtnum)
                    else:
                        diffinfolist[ins].deltacmt += self.diff(
                            instances[ins].urlinfodict[key].realnum, info.realnum)
                    diffinfolist[ins].deltaclick += self.diff(
                        instances[ins].urlinfodict[key].clicknum, info.clicknum)
                    diffinfolist[ins].deltavote += self.diff(
                        instances[ins].urlinfodict[key].votenum, info.votenum)
                    diffinfolist[ins].deltafans += self.diff(
                        instances[ins].urlinfodict[key].fansnum, info.fansnum)
            else:
                self.printinfo(ins, instances[ins].urlinfodict[key], '+')
                if instances[ins].urlinfodict[key].cmtnum > 0:
                    diffinfolist[ins].deltacmt += instances[ins].urlinfodict[key].cmtnum
                else:
                    diffinfolist[ins].deltacmt += max(0, instances[ins].urlinfodict[key].realnum)
                diffinfolist[ins].deltaclick += max(0, instances[ins].urlinfodict[key].clicknum)
                diffinfolist[ins].deltavote += max(0, instances[ins].urlinfodict[key].votenum)
                diffinfolist[ins].deltafans += max(0, instances[ins].urlinfodict[key].fansnum)
    Logger.getlogging().info(
        '##############################################################################################')
    if FileUtility.exists(self.difffile):
        FileUtility.remove(self.difffile)
    for key in diffinfolist.keys():
        Logger.getlogging().info(diffinfolist[key].tostring())
        FileUtility.writeline(self.difffile, diffinfolist[key].tostring())