Example n. 1
 def __upload__(self, filepath):
     flag = True
     FileUtility.mkdirs(self.urlbackuppath)
     FileUtility.copy(filepath, self.urlbackuppath)
     self.upload_file_list[FileUtility.getfilename(filepath)] = []
     # if filepath.endswith(constant.POST_FILE_SUFFIX) or FileUtility.getfilelines(filepath) <= constant.REMOTE_DOWNLOADER_MIN_LINES:
     #     if self.limpls:
     #         if self.limplsindex >= len(self.limpls):
     #             self.limplsindex = 0
     #         flag = self.limpls[self.limplsindex].upload(filepath)
     #         self.limplsindex += 1
     if filepath.endswith(constant.WEBKIT_FILE_SUFFIX):
         if self.wimpls:
             if self.wimplsindoex >= len(self.wimpls):
                 self.wimplsindoex = 0
             self.wimpls[self.wimplsindoex].upload(filepath)
             self.wimplsindoex += 1
     elif self.impls:
         if self.implsindex >= len(self.impls):
             self.implsindex = 0
         flag = self.impls[self.implsindex].upload(filepath)
         self.implsindex += 1
     else:
         flag = False
         Logger.getlogging().warning('No taskid or download platform!')
     return flag
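
At its core, __upload__ spreads url files across a pool of uploader implementations with a wrap-around index. A minimal, self-contained sketch of that round-robin dispatch (FakeUploader and the file names are hypothetical stand-ins, not part of the project):

class FakeUploader(object):
    # Hypothetical stand-in for the real uploader implementations kept in
    # self.impls / self.wimpls / self.limpls.
    def __init__(self, name):
        self.name = name

    def upload(self, filepath):
        print('%s uploads %s' % (self.name, filepath))
        return True

impls = [FakeUploader('impl-0'), FakeUploader('impl-1')]
implsindex = 0
for path in ['a.txt', 'b.txt', 'c.txt']:
    if implsindex >= len(impls):  # wrap the index around, as __upload__ does
        implsindex = 0
    flag = impls[implsindex].upload(path)
    implsindex += 1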
def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.removefiles(donepath)
    backupPath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_URL_BACKUP), TimeUtility.getcurrentdate())
    interval = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_INTERVAL)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath) 
    while True:
        Logger.getlogging().debug('scanning')
        flag = False
        for filename in os.listdir(scanningPath):
            try:
                urlfilepath = os.path.join(scanningPath, filename)
                backupfile  = os.path.join(backupPath, filename)
                if os.path.isfile(urlfilepath) and 'tmp' not in filename:
                    Logger.getlogging().info('Get url file:{file}'.format(file=filename))
                    FileUtility.copy(urlfilepath, backupfile)
                    download(urlfilepath)
                if not flag:
                    flag = True
            except:
                Logger.printexception()
        if not flag:
            Logger.getlogging().debug('scanning interval sleeping {interval}s'.format(interval=interval))
            time.sleep(int(interval))    
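
scanning() is a plain directory poller: every url file dropped into scanningPath is backed up, handed to download(), and the loop only sleeps when a pass over the directory finds nothing at all. A stripped-down sketch of the same polling idea (poll(), its arguments and the handle() callback are hypothetical; the real loop also copies each file into the dated backup directory and wraps each file in try/except):

import os
import time

def poll(scanning_path, interval, handle):
    # Process every regular, non-temporary file and sleep only when the
    # directory scan turned up nothing at all, mirroring scanning() above.
    while True:
        found = False
        for name in os.listdir(scanning_path):
            found = True
            path = os.path.join(scanning_path, name)
            if os.path.isfile(path) and 'tmp' not in name:
                handle(path)
        if not found:
            time.sleep(int(interval))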
Example n. 3
 def copyfiles(self):
     # s1/s2 input paths
     s1file = SpiderConfigure.getinstance().gets1file()
     s2file = SpiderConfigure.getinstance().gets2file()
     # s1/s2 history paths
     self.conf.setchannel(SPIDER_CHANNEL_S1)
     s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
     if FileUtility.exists(s1file):
         lines = 0
         firstline = True
         with open(s1file, 'r') as fp:
             for line in fp.readlines():
                 line = line.strip()
                 if firstline:
                     firstline = False
                     if line[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                         line = line[3:]
                 if line:
                     lines += 1
                     SpiderReport.puts1url(line)
         if lines > 0:
             FileUtility.copy(s1file, s1tempfile)
             SpiderReport.update(SPIDER_CHANNEL_S1, '', SpiderReport.URL_UPLOAD, lines)
     if FileUtility.exists(s2file):
         FileUtility.copy(s2file, s2temppath)
 def upload(self, path):
     tencentplatform.postdownloader.PostDownloader.upload(self, path)
     filename = FileUtility.getfilename(path)
     FileUtility.mkdirs(self.download_path)
     FileUtility.copy(
         path,
         '{dir}/{filename}.txt.{ts}.done'.format(dir=self.download_path,
                                                 filename=filename,
                                                 ts=int(time.time())))
     return True
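
The destination name built here, {dir}/{filename}.txt.{ts}.done, is what the downloader side later scans for: the original file name, an upload timestamp, and a .done marker suffix. A small sketch of that naming convention (donename() is a hypothetical helper; the project goes through FileUtility rather than os.path):

import os
import time

def donename(download_path, srcpath):
    # '<download_path>/<basename>.txt.<unix timestamp>.done'
    return '{dir}/{filename}.txt.{ts}.done'.format(
        dir=download_path,
        filename=os.path.basename(srcpath),
        ts=int(time.time()))

# e.g. donename('./data/platform', './data/temp/urls/urls_0') might return
# './data/platform/urls_0.txt.1700000000.done'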
Example n. 5
 def recoverfile(self, filename):
     """"""
     # 查找,获取backup路径,再恢复到目的目录./data/temp/urls
     filelist = FileUtility.getfilelist(self.urlbackuppath, [])
     tempfilepath = os.path.join(self.urlbackuppath, filename)
     if tempfilepath in filelist:
         newfilepath = self.renewfilename(tempfilepath)
         FileUtility.copy(tempfilepath, newfilepath)
         time.sleep(0.5)
         if FileUtility.exists(newfilepath):
             return newfilepath
     return False
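
recoverfile returns the recovered path on success and False otherwise, so callers test the return value instead of catching an exception. The restore itself boils down to copying a backup file into the working url directory; a self-contained sketch under that assumption (names and paths are illustrative, and the real method also renames the file via renewfilename and briefly waits for the copy to land):

import os
import shutil

def recover_from_backup(backup_dir, filename, dest_dir):
    # If a backup copy exists, restore it into dest_dir and return the
    # new path; otherwise return False, like recoverfile() above.
    src = os.path.join(backup_dir, filename)
    if not os.path.isfile(src):
        return False
    dst = os.path.join(dest_dir, filename)
    shutil.copy(src, dst)
    return dst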
 def upload(self, path):
     tencentplatform.tencentdownloader.TencentDownloader.upload(self, path)
     filename = FileUtility.getfilename(path)
     ts = int(time.time())
     FileUtility.mkdirs(self.download_path)
     donefile = '{dir}/{filename}.txt.{ts}.done'.format(
         dir=self.download_path, filename=filename, ts=ts)
     Logger.getlogging().debug(path + '--->' + donefile)
     FileUtility.copy(path, donefile)
     return True
Example n. 7
 def copyfiles(self):
     # s1/s2 input paths
     s1file = SpiderConfigure.getinstance().gets1file()
     s2file = SpiderConfigure.getinstance().gets2file()
     # s1/s2 history paths
     self.conf.setchannel(SPIDER_CHANNEL_S1)
     # s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
     if FileUtility.exists(s1file):
         lines = 0
         firstline = True
         with open(s1file, 'r') as fp:
             rows = []
             for line in fp.readlines():
                 line = line.strip()
                 if firstline:
                     firstline = False
                     if line[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                         line = line[3:]
                 if line:
                     lines += 1
                     rows.append(line)
                 if lines % constant.SPIDER_S1_MAX_LINE_PER_FILE == 0:
                     s1tempfile = URLFileManager.generateurlfilepath() + constant.WEBKIT_FILE_SUFFIX
                     FileUtility.writelines(s1tempfile, rows)
                     rows = []
             if rows:
                 s1tempfile = URLFileManager.generateurlfilepath() + constant.WEBKIT_FILE_SUFFIX
                 FileUtility.writelines(s1tempfile, rows)
                 rows = []
     if FileUtility.exists(s2file):
         FileUtility.copy(s2file, s2temppath)
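
The S1 branch above writes the collected urls out in batches of constant.SPIDER_S1_MAX_LINE_PER_FILE lines and flushes whatever is left over at the end. The same batching idea in isolation (write_in_chunks() and the writefile callback are hypothetical; the project uses URLFileManager.generateurlfilepath() and FileUtility.writelines()):

def write_in_chunks(lines, max_per_file, writefile):
    # Collect non-empty lines into batches of at most max_per_file, calling
    # writefile(batch) for each full batch and once more for the remainder.
    rows = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        rows.append(line)
        if len(rows) == max_per_file:
            writefile(rows)
            rows = []
    if rows:
        writefile(rows)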
Example n. 8
 def bin2json(self, file):
     filename = FileUtility.getfilename(file).replace('.done', '.json')
     fullpath = self.info.jsonpath + filename
     FileUtility.copy(file, fullpath)
     return fullpath
 def download(self):
     """
     Downloading on the platform takes two steps, while a direct data request on Windows only needs step 2, download():
     step 1: download the data from the platform to the local ./data/platform directory
     step 2: copy the data from ./data/platform to ./data/temp/done, then store the parsed json data under ./data/temp/json
     """
     files = []
     if self.completed():
         return files
     Logger.getlogging().debug(self.download_path)
     srclist = FileUtility.getfilelist(self.download_path, [])
     for donefile in srclist:
         filename = FileUtility.getfilename(donefile)
         if donefile.endswith(
                 'done') and filename not in self.download_file_list:
             self.download_file_list.append(filename)
             self.download_time = time.time()
             for upfile in self.upload_file_list.keys():
                 if filename.startswith(upfile):
                     FileUtility.copy(donefile, self.cache_path)
                     binfile = self.cache_path + FileUtility.getfilename(
                         donefile)
                     if FileUtility.getfilesize(
                             donefile) == FileUtility.getfilesize(binfile):
                         Logger.getlogging().info(
                             'Remove {file}'.format(file=donefile))
                         FileUtility.remove(donefile)
                         if FileUtility.exists(donefile):
                             Logger.getlogging().error(
                                 'Remove {file} failed'.format(
                                     file=donefile))
                     else:
                         Logger.getlogging().error(
                             'File not equal {file}'.format(file=donefile))
                     jsonfile = self.bin2json(binfile)
                     files.append(jsonfile)
                     uploadtime = self.uploadfile_retranslist[
                         upfile].start_time
                     if RegexUtility.match(
                             TencentDownloader.DOWNLOAD_FORMAT1.format(
                                 file=upfile), filename):
                         self.upload_file_list.pop(upfile)
                         self.uploadfile_retranslist.pop(upfile)
                     elif RegexUtility.match(
                             TencentDownloader.DOWNLOAD_FORMAT2.format(
                                 file=upfile), filename):
                         value = RegexUtility.parse(
                             TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile),
                             filename)[0]
                         if value[0] == value[1]:
                             self.upload_file_list.pop(upfile)
                             self.uploadfile_retranslist.pop(upfile)
                     if not FileUtility.exists(jsonfile):
                         Logger.getlogging().error(
                             'no json file generate from done file:{done}'.
                             format(done=binfile))
                         os.mknod(jsonfile)
                     # update upload time
                     keys = self.sortkeys()
                     for fl in keys:
                         if self.uploadfile_retranslist[
                                 fl].start_time >= uploadtime:
                             self.uploadfile_retranslist[
                                 fl].start_time = time.time()
                             time.sleep(0.1)
                     break
     return files
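
After copying each .done file into the cache directory, download() compares source and destination sizes before removing the source, so a truncated copy is never silently accepted. A minimal sketch of that copy-verify-remove step with the standard library (paths are illustrative; the project routes this through its FileUtility wrappers):

import os
import shutil

def copy_and_verify(donefile, cache_dir):
    # Copy donefile into cache_dir and remove the original only when the
    # copy has exactly the same size, as download() does above.
    dst = os.path.join(cache_dir, os.path.basename(donefile))
    shutil.copy(donefile, dst)
    if os.path.getsize(donefile) == os.path.getsize(dst):
        os.remove(donefile)
    return dst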
 def bin2json(self, file):
     jsonfile = tencentplatform.tencentdownloader.TencentDownloader.bin2json(
         self, file)
     FileUtility.copy(file, jsonfile)
     return jsonfile
 def localupload(self, path):
     srcfilepath = os.path.join(self.info.urlpath, FileUtility.getfilename(path))   
     FileUtility.copy(path, srcfilepath)
     #autodownloader.download(path)
     return True
 def sshdownload(self, donefile):
     Logger.getlogging().info('sshdownload:' + donefile)
     FileUtility.copy(donefile, self.info.localdonepath)