Example #1
    def waibuetl(self):
        """Upload the waibu query file, poll for the processed results, then analyze and archive each downloaded file."""
        waibubackup = SpiderConfigure.getwaibubackup()
        if not FileUtility.exists(waibubackup):
            FileUtility.mkdirs(waibubackup)

        waibufile = self.etl.getqueryfromdb()
        if not FileUtility.exists(waibufile):
            Logger.getlogging().warning(
                '{waibufile} not generated!'.format(waibufile=waibufile))
            return

        outtime = 0
        self.wdownloader.upload(waibufile)
        continueflag = True
        while continueflag:
            downloadfiles = []
            while True:
                Logger.getlogging().info(
                    'sleeping {sec}s......'.format(sec=self.waitingperiod))
                time.sleep(self.waitingperiod)
                outtime += self.waitingperiod
                if self.wdownloader.iscompleted():
                    continueflag = False
                    break
                try:
                    downloadfiles = self.wdownloader.download()
                    if downloadfiles:
                        break
                except:
                    Logger.printexception()
                if outtime >= self.waibutimeout:
                    Logger.getlogging().warning(
                        'Waibu Data Download Timeout! Spending {sec}s'.format(
                            sec=outtime))
                    continueflag = False
                    break
            for dfile in downloadfiles:
                starttime = TimeUtility.getcurrentdate(
                    TimeUtility.TIME_FORMAT_DEFAULT)
                self.etl.wb_analysis(dfile)
                #if FileUtility.exists(waibubackup+FileUtility.getfilename(dfile)):
                #FileUtility.remove(waibubackup+FileUtility.getfilename(dfile))
                FileUtility.move(dfile, waibubackup)
                logstring = 'PROCESSWAIBUFILE:\t{file}\t{start}\t{end}'.format(
                    file=FileUtility.getfilename(dfile),
                    start=starttime,
                    end=TimeUtility.getcurrentdate())
                Logger.getlogging().info(logstring)
                if outtime >= self.waibutimeout:
                    Logger.getlogging().warning(
                        'Waibu Data Download Timeout! Spending {sec}s'.format(
                            sec=outtime))
                    continueflag = False
                    break
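
The loop above polls in fixed sleep increments until the remote side reports completion or waibutimeout is exceeded. A self-contained sketch of that poll-until-done-or-timeout pattern, with hypothetical is_completed and fetch callables standing in for the wdownloader methods:

import time

def poll_until_done(is_completed, fetch, period=30, timeout=3600):
    """Poll fetch() every `period` seconds; stop on completion, data, or timeout.

    is_completed and fetch are assumed callables, not part of the project above.
    """
    waited = 0
    while waited < timeout:
        time.sleep(period)
        waited += period
        if is_completed():
            return []
        try:
            batch = fetch()
            if batch:
                return batch
        except Exception:
            pass  # a failed poll should not abort the wait loop
    return []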
Example #2
def sshdownload(host, port, username, pwd, targetFilePath, localPath):
    Logger.getlogging().info(
        'scp -P {port} {username}@{host}:{file} {path}'.format(
            port=port, username=username, host=host,
            file=targetFilePath, path=localPath))
    ssh = SSHConnection(host, port, username, pwd)
    if ssh.connect():
        fileName = targetFilePath.split('/')[-1]
        # Download to a temporary name first, then rename, so consumers
        # never pick up a partially written file.
        ssh.download(targetFilePath, localPath + fileName + '.tmp')
        ssh.close()
        FileUtility.move(localPath + fileName + '.tmp', localPath + fileName)
        return True
    return False
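
A hedged usage sketch (host, port, credentials, and paths below are placeholders; note that localPath is concatenated directly with the file name, so it should end with a slash):

if sshdownload('10.0.0.5', 22, 'spider', 'secret',
               '/data/remote/urls.done', './data/platform/'):
    print('download complete')  # placeholder values throughout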
Example #3
 def download(self):
     """
     Downloading on the platform takes two steps, while Windows requests the data directly and only needs step 2: download()
     step 1: download the data from the platform to the local ./data/platform
     step 2: copy the data from ./data/platform to ./data/temp/done, then store the parsed json data in ./data/temp/json
     """
     files = []
     Logger.getlogging().debug('Get Valid PUC File From ' +
                               self.download_path)
     #srclist = self.getvalidfiles(self.download_path)
     srclist = FileUtility.getfilelist(self.download_path,
                                       [])[0:self.maxfilenum]
     for donefile in srclist:
         try:
             if donefile.endswith('done'):
                 Logger.getlogging().info('MOVE {file} TO {path}'.format(
                     file=donefile, path=self.done_file))
                 FileUtility.move(donefile, self.done_file)
                 binfile = os.path.join(self.done_file,
                                        FileUtility.getfilename(donefile))
                 #FileUtility.copy(donefile, self.cache_path)
                 #binfile = self.cache_path+ FileUtility.getfilename(donefile)
                 #if FileUtility.getfilesize(donefile) == FileUtility.getfilesize(binfile):
                 ##back up today's puc files
                 #Logger.getlogging().info('MOVE {file} TO {path}'.format(file=donefile,path=self.pucbacktoday))
                 #FileUtility.move(donefile, self.pucbacktoday)
                 #if FileUtility.exists(donefile):
                 #Logger.getlogging().error('MOVE {file} failed'.format(file=donefile))
                 #else:
                 #Logger.getlogging().error('File not equal {file}'.format(file=donefile))
                 jsonfile = self.bin2json(binfile)
                 files.append(jsonfile)
                 try:
                     self.s3puc_dumpurls(jsonfile)
                     time.sleep(0.5)
                     Logger.getlogging().debug(
                         'Remove {f}'.format(f=jsonfile))
                     FileUtility.remove(jsonfile)
                     donefile2 = os.path.join(
                         self.done_file, FileUtility.getfilename(donefile))
                     Logger.getlogging().debug(
                         'Remove {f}'.format(f=donefile2))
                     FileUtility.remove(donefile2)
                 except:
                     Logger.printexception()
                     Logger.getlogging().error(
                         'no json file generated from done file:{done}'.
                         format(done=binfile))
                     os.mknod(jsonfile)  # create an empty json file in place of the failed conversion
         except:
             Logger.printexception()
     return files
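
At its core the method is a move, convert, publish, clean-up pipeline per done file. A minimal standalone sketch of that flow, with hypothetical convert and publish callables standing in for bin2json and s3puc_dumpurls:

import os
import shutil

def process_done_file(donefile, done_dir, convert, publish):
    """Move a finished file into done_dir, convert it to json, publish, then clean up.

    convert and publish are assumed callables, not the project's actual API.
    """
    shutil.move(donefile, done_dir)
    binfile = os.path.join(done_dir, os.path.basename(donefile))
    jsonfile = convert(binfile)
    publish(jsonfile)
    os.remove(jsonfile)
    os.remove(binfile)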
Example #4
 def removecachefile():
     """Back up the working cache directory, then purge backup folders older than the configured day limit."""
     cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_TEMPLATE_WORK_DIRECTORY)
     databackupfolder = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                    const.SPIDER_DATA_BACKUP_PATH) + TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
     if FileUtility.exists(cache):
         FileUtility.move(cache, databackupfolder)
         FileUtility.rmdir(cache)
     limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH_LIMIT))
     databackuppath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_DATA_BACKUP_PATH)
     if FileUtility.exists(databackuppath):
         validdate = TimeUtility.getdatebefore(limit, '%Y%m%d000000')
         for s in os.listdir(databackuppath):
             fullpath = os.path.join(databackuppath, s)
             #Logger.getlogging().info('remove cache folder ' + fullpath)
             #FileUtility.rmdir(fullpath)
             if s < validdate:
                 Logger.getlogging().info('remove cache folder ' + fullpath)
                 FileUtility.rmdir(fullpath)
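
The purge works because the backup folders are named with timestamps (TimeUtility.TIMESTAMP_FORMAT), so a plain string comparison against the '%Y%m%d000000' cutoff doubles as a date comparison. A standalone sketch of that retention rule:

import os
import shutil
from datetime import datetime, timedelta

def purge_old_backups(backup_root, keep_days):
    """Remove timestamp-named backup folders older than keep_days."""
    cutoff = (datetime.now() - timedelta(days=keep_days)).strftime('%Y%m%d000000')
    for name in os.listdir(backup_root):
        full = os.path.join(backup_root, name)
        # Lexicographic compare is safe because names are fixed-width timestamps.
        if os.path.isdir(full) and name < cutoff:
            shutil.rmtree(full)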
Example #5
 def download(self):
     """Fetch newly finished 'done' files listed on the task page, then delegate to the base class download."""
     doneurl = TencentDownloader.DONE_FILE_URL.format(
         taskid=self.taskinfo.taskid)
     html = TencentDownloader.httpget(doneurl)
     if html:
         xparse = XPathUtility(html)
         for donefile in xparse.getlist(r'//tr/td[2]/a'):
             if donefile.endswith(
                     'done') and donefile not in self.downloadedfiles:
                 for upfile in self.upload_file_list:
                     if donefile.startswith(upfile):
                         FileUtility.mkdirs(self.download_path)
                         self.execute(
                             TencentDownloader.DOWNLOAD_COMMAND.format(
                                 taskid=self.taskinfo.taskid,
                                 filename=donefile))
                         FileUtility.move('./' + donefile,
                                          self.download_path)
                         break
                 self.downloadedfiles.append(donefile)
     return tencentdownloader.TencentDownloader.download(self)
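
XPathUtility is the project's own wrapper; the same table scrape can be sketched with plain lxml (the page layout is inferred from the XPath above, and using lxml here is an assumption, not the project's confirmed implementation):

from lxml import html

def list_done_files(page_html):
    """Return link texts from the second table column of a listing page."""
    tree = html.fromstring(page_html)
    return [a.text_content().strip() for a in tree.xpath('//tr/td[2]/a')]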
Example #6
def scanning():
    """Scan the drop directory for url files, move each one to the backup path, and parse it."""
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                       const.SPIDER_POST_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    backupPath = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                           const.DOWNLOADER_URL_BACKUP)

    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath)
    flag = False
    for filename in os.listdir(scanningPath):
        fp = os.path.join(scanningPath, filename)
        backupfile = os.path.join(backupPath, filename)
        if os.path.isfile(fp) and 'tmp' not in filename:
            Logger.getlogging().info(
                'Get url file:{file}'.format(file=filename))
            FileUtility.move(fp, backupfile)
            readFile(backupfile, filename)
        flag = True
    if not flag:
        time.sleep(10)
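
For reference, a condensed standalone sketch of the scan, backup, parse pass above (the read_file callable is a hypothetical stand-in for readFile):

import os
import shutil
import time

def scan_once(scan_dir, backup_dir, read_file):
    """Move each non-tmp file from scan_dir to backup_dir and parse it; sleep when idle."""
    found = False
    for name in os.listdir(scan_dir):
        src = os.path.join(scan_dir, name)
        if os.path.isfile(src) and 'tmp' not in name:
            dst = os.path.join(backup_dir, name)
            shutil.move(src, dst)
            read_file(dst, name)
            found = True
    if not found:
        time.sleep(10)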

 def localdownload(self, donefile):
     FileUtility.move(donefile, self.info.localdonepath)