def waibuetl(self):
    """Run the external ("waibu") data ETL round-trip.

    Uploads the query file produced by the local ETL step to the remote
    downloader, then polls for finished download files until the
    downloader reports completion or ``self.waibutimeout`` is exceeded.
    Every downloaded file is analysed and archived into the waibu
    backup folder.
    """
    # Make sure the backup directory for processed waibu files exists.
    waibubackup = SpiderConfigure.getwaibubaup()
    if not FileUtility.exists(waibubackup):
        FileUtility.mkdirs(waibubackup)
    # The query file comes from the database via the ETL helper; without
    # it there is nothing to upload, so bail out early.
    waibufile = self.etl.getqueryfromdb()
    if not FileUtility.exists(waibufile):
        Logger.getlogging().warning(
            '{waibufile} not generate!'.format(waibufile=waibufile))
        return
    outtime = 0  # accumulated waiting "time" in seconds
    self.wdownloader.upload(waibufile)
    continueflag = True
    while continueflag:
        downloadfiles = []
        while True:
            Logger.getlogging().info(
                'sleeping {sec}s......'.format(sec=self.waitingperiod))
            #time.sleep(self.waitingperiod)
            # NOTE(review): the sleep above is commented out, yet outtime
            # still advances by waitingperiod each pass — the "timeout"
            # is therefore counted in loop iterations, not wall-clock
            # time. Confirm this is intentional.
            outtime += self.waitingperiod
            if self.wdownloader.iscompleted():
                continueflag = False
                break
            try:
                downloadfiles = self.wdownloader.download()
                if downloadfiles:
                    break
            except:
                # Best-effort polling: log the failure and keep retrying
                # until the timeout expires.
                Logger.printexception()
            if outtime >= self.waibutimeout:
                Logger.getlogging().warning(
                    'Waibu Data Download Timeout! Spending {sec}s'.format(
                        sec=outtime))
                continueflag = False
                break
        # Process whatever this polling round produced.
        for dfile in downloadfiles:
            starttime = TimeUtility.getcurrentdate(
                TimeUtility.TIME_FORMAT_DEFAULT)
            self.etl.wb_analysis(dfile)
            #if FileUtility.exists(waibubackup+FileUtility.getfilename(dfile)):
            #FileUtility.remove(waibubackup+FileUtility.getfilename(dfile))
            # Archive the processed file so it is not picked up again.
            FileUtility.move(dfile, waibubackup)
            logstring = 'PROCESSWAIBUFILE:\t{file}\t{start}\t{end}'.format(
                file=FileUtility.getfilename(dfile),
                start=starttime,
                end=TimeUtility.getcurrentdate())
            Logger.getlogging().info(logstring)
        # Re-check the timeout after processing so a long analysis pass
        # cannot keep the outer loop alive past the deadline.
        if outtime >= self.waibutimeout:
            Logger.getlogging().warning(
                'Waibu Data Download Timeout! Spending {sec}s'.format(
                    sec=outtime))
            continueflag = False
            break
def sshdownload(host, port, username, pwd, targetFilePath, localPath):
    """Fetch *targetFilePath* from a remote host over SSH into *localPath*.

    The file is downloaded to a ``.tmp`` name first and renamed on
    success, so a partially transferred file is never left under its
    final name.  Returns True when the connection (and download)
    succeeded, False when the SSH connection could not be established.
    """
    Logger.getlogging().info(
        'scp -P {port} {username}@{host}:{file} {path}'.format(
            port=port, username=username, host=host,
            file=targetFilePath, path=localPath))
    ssh = SSHConnection(host, port, username, pwd)
    if not ssh.connect():
        return False
    # Keep only the final path component as the local file name.
    fileName = targetFilePath.split('/')[-1]
    tmpfile = localPath + fileName + '.tmp'
    ssh.download(targetFilePath, tmpfile)
    ssh.close()
    # Publish the completed download under its real name.
    FileUtility.move(tmpfile, localPath + fileName)
    return True
def download(self):
    """Collect finished ("done") files and convert them to JSON.

    On the platform, downloading is a two-step process, while Windows
    requests data directly and only performs step 2 (this method):
    step 1: download data from the platform into local ./data/platform
    step 2: copy data from ./data/platform into ./data/temp/done, then
    store the parsed JSON data under ./data/temp/json

    Returns the list of JSON file paths generated in this pass.
    """
    files = []
    Logger.getlogging().debug('Get Valid PUC File From ' + self.download_path)
    #srclist = self.getvalidfiles(self.download_path)
    # Cap the amount of work per pass at maxfilenum source files.
    srclist = FileUtility.getfilelist(self.download_path, [])[0:self.maxfilenum]
    for donefile in srclist:
        try:
            if donefile.endswith('done'):
                Logger.getlogging().info('MOVE {file} TO {path}'.format(
                    file=donefile, path=self.done_file))
                FileUtility.move(donefile, self.done_file)
                binfile = os.path.join(self.done_file,
                                       FileUtility.getfilename(donefile))
                #FileUtility.copy(donefile, self.cache_path)
                #binfile = self.cache_path+ FileUtility.getfilename(donefile)
                #if FileUtility.getfilesize(donefile) == FileUtility.getfilesize(binfile):
                ## back up today's puc files
                #Logger.getlogging().info('MOVE {file} TO {path}'.format(file=donefile,path=self.pucbacktoday))
                #FileUtility.move(donefile, self.pucbacktoday)
                #if FileUtility.exists(donefile):
                #Logger.getlogging().error('MOVE {file} failed'.format(file=donefile))
                #else:
                #Logger.getlogging().error('File not equal {file}'.format(file=donefile))
                jsonfile = self.bin2json(binfile)
                files.append(jsonfile)
                try:
                    self.s3puc_dumpurls(jsonfile)
                    # Brief pause between dumps — presumably rate
                    # limiting; TODO confirm why 0.5s.
                    time.sleep(0.5)
                    Logger.getlogging().debug(
                        'Remove {f}'.format(f=jsonfile))
                    FileUtility.remove(jsonfile)
                    donefile2 = os.path.join(
                        self.done_file, FileUtility.getfilename(donefile))
                    Logger.getlogging().debug(
                        'Remove {f}'.format(f=donefile2))
                    FileUtility.remove(donefile2)
                except:
                    # Dump failed: keep a placeholder json file so later
                    # passes can tell this done file was already seen.
                    Logger.printexception()
                    Logger.getlogging().error(
                        'no json file generate from done file:{done}'.
                        format(done=binfile))
                    os.mknod(jsonfile)
        except:
            # Per-file isolation: one bad file must not abort the batch.
            Logger.printexception()
    return files
def removecachefile():
    """Archive the working cache and prune expired data backups.

    Moves the current template work directory into a timestamped folder
    under the backup path, then deletes every backup folder whose
    timestamped name is older than the configured retention limit
    (SPIDER_OUTPUT_PATH_LIMIT days).
    """
    cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                      const.SPIDER_TEMPLATE_WORK_DIRECTORY)
    databackupfolder = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN,
        const.SPIDER_DATA_BACKUP_PATH) + TimeUtility.getcurrentdate(
            TimeUtility.TIMESTAMP_FORMAT)
    if FileUtility.exists(cache):
        FileUtility.move(cache, databackupfolder)
        # Defensive cleanup in case move left the source behind.
        FileUtility.rmdir(cache)
    limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                          const.SPIDER_OUTPUT_PATH_LIMIT))
    databackuppath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                               const.SPIDER_DATA_BACKUP_PATH)
    if FileUtility.exists(databackuppath):
        # Backup folder names are '%Y%m%d%H%M%S' timestamps, so plain
        # string comparison orders them chronologically.
        validdate = TimeUtility.getdatebefore(limit, '%Y%m%d000000')
        for s in os.listdir(databackuppath):
            if s < validdate:
                # Join the path only for folders we actually remove
                # (the original computed it unconditionally, twice).
                fullpath = os.path.join(databackuppath, s)
                Logger.getlogging().info('remove cache folder ' + fullpath)
                FileUtility.rmdir(fullpath)
def download(self):
    """Fetch newly finished 'done' files for this task from Tencent.

    Scrapes the task's done-file listing page, downloads every not yet
    seen 'done' file whose name matches one of the uploaded files, and
    moves it into ``self.download_path``.  Finally delegates to the
    base-class download for the common post-processing.
    """
    doneurl = TencentDownloader.DONE_FILE_URL.format(
        taskid=self.taskinfo.taskid)
    html = TencentDownloader.httpget(doneurl)
    if html:
        xparse = XPathUtility(html)
        # The second table column holds the file links on the listing page.
        for donefile in xparse.getlist(r'//tr/td[2]/a'):
            if donefile.endswith(
                    'done') and donefile not in self.downloadedfiles:
                for upfile in self.upload_file_list:
                    if donefile.startswith(upfile):
                        FileUtility.mkdirs(self.download_path)
                        self.execute(
                            TencentDownloader.DOWNLOAD_COMMAND.format(
                                taskid=self.taskinfo.taskid,
                                filename=donefile))
                        # The download command drops the file into the
                        # current working directory; relocate it.
                        FileUtility.move('./' + donefile, self.download_path)
                        break
                # Record the file even when it matched no upload entry,
                # so it is not re-examined on the next poll.
                self.downloadedfiles.append(donefile)
    return tencentdownloader.TencentDownloader.download(self)
def scanning():
    """Scan this node's URL drop directory for incoming url files.

    Every regular file without 'tmp' in its name is moved to the backup
    directory and handed to readFile() for processing.  When the scan
    finds nothing, sleep 10 seconds before the caller polls again.
    """
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                       const.SPIDER_POST_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    backupPath = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                           const.DOWNLOADER_URL_BACKUP)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath)
    found_any = False
    for entry in os.listdir(scanningPath):
        src = os.path.join(scanningPath, entry)
        dst = os.path.join(backupPath, entry)
        # Skip directories and in-flight temporary files.
        if os.path.isfile(src) and 'tmp' not in entry:
            Logger.getlogging().info(
                'Get url file:{file}'.format(file=entry))
            FileUtility.move(src, dst)
            readFile(dst, entry)
            found_any = True
    if not found_any:
        time.sleep(10)
def localdownload(self, donefile):
    """Move a finished file into the configured local done directory."""
    destination = self.info.localdonepath
    FileUtility.move(donefile, destination)