def upload(self, path):
    """Upload *path* to the Tencent platform, retrying with exponential backoff.

    Registers the file for retransmission tracking first; on total
    failure sends an upload-failed notification and returns False.
    """
    record = RetransInfo()
    record.filename = FileUtility.getfilename(path)
    # Avoid two consecutive uploads sharing the same whole-second timestamp.
    if int(time.time()) == int(self.start_time):
        time.sleep(0.1)
    self.start_time = time.time()
    record.start_time = self.start_time
    self.uploadfile_retranslist[record.filename] = record
    self.upload_file_list[record.filename] = []
    cmd = TencentDownloader.UPLOAD_COMMAND.format(
        file=path,
        url=self.upload_url,
        user_id=self.taskinfo.userid,
        task_name=self.taskinfo.taskname,
        task_id=self.taskinfo.taskid)
    if self.execute(cmd):
        return True
    # Retry with exponentially growing delays: 10s, 20s, 40s, ...
    delay = 10
    for _ in range(self.retrytimes):
        time.sleep(delay)
        delay *= 2
        if self.execute(cmd):
            return True
    # Every attempt failed: notify and report failure.
    param = NotifyParam()
    param.code = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED
    param.message = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED_MESSAGE_FORMAT.format(
        file=FileUtility.getfilename(path),
        taskid=self.taskinfo.taskid)
    SpiderNotify.notify(param)
    return False
Exemple #2
0
 def download(self):
     """Fetch newly finished ".done" files from the remote done path.

     Returns the list of local file paths (converted to json when
     self.info.jsonpath is configured) downloaded this round.
     """
     results = []
     if self.completed():
         return results
     Logger.getlogging().debug(self.info.donepath)
     for remote in self.sshls(self.info.donepath):
         remote = remote.strip()
         name = FileUtility.getfilename(remote)
         if not remote.endswith('done') or name in self.download_file_list:
             continue
         self.download_file_list.append(name)
         # Match the done file back to the upload that produced it.
         for upfile in self.upload_file_list.keys():
             if not name.startswith(upfile):
                 continue
             FileUtility.mkdirs(self.info.localdonepath)
             self.sshdownload(remote)
             localfile = self.info.localdonepath + FileUtility.getfilename(
                 remote)
             if self.info.jsonpath:
                 localfile = self.bin2json(localfile)
             results.append(localfile)
             self.download_time = int(time.time())
             # The upload is accounted for; stop tracking it.
             self.upload_file_list.pop(upfile)
             self.uploadfile_retranslist.pop(upfile)
             if not FileUtility.exists(localfile):
                 Logger.getlogging().error(
                     'no json file generate from done file:{done}'.
                     format(done=localfile))
             break
     return results
 def upload(self, path):
     """Register *path* for retransmission tracking and upload it locally.

     Returns True on success.  The original fell through on failure,
     implicitly returning None; return an explicit False instead (both
     are falsy, so callers are unaffected), matching the sibling
     sshupload-based implementation.
     """
     retans = RetransInfo()
     retans.filename = FileUtility.getfilename(path)
     retans.start_time = int(time.time())
     self.uploadfile_retranslist[retans.filename] = retans
     self.upload_file_list[retans.filename] = []
     if self.localupload(path):
         return True
     return False
Exemple #4
0
 def download(self):
     """
     On the platform the download is two steps, while a direct Windows
     data request only performs step2:download():
     step1: download data from the platform to local ./data/platform
     step2: copy data from ./data/platform into ./data/temp/done, then
     store the parsed json data under ./data/temp/json
     Returns the list of json file paths produced this round.
     """
     files = []
     Logger.getlogging().debug('Get Valid PUC File From ' +
                               self.download_path)
     #srclist = self.getvalidfiles(self.download_path)
     # Cap the batch at self.maxfilenum files per call.
     srclist = FileUtility.getfilelist(self.download_path,
                                       [])[0:self.maxfilenum]
     for donefile in srclist:
         try:
             if donefile.endswith('done'):
                 Logger.getlogging().info('MOVE {file} TO {path}'.format(
                     file=donefile, path=self.done_file))
                 FileUtility.move(donefile, self.done_file)
                 binfile = os.path.join(self.done_file,
                                        FileUtility.getfilename(donefile))
                 #FileUtility.copy(donefile, self.cache_path)
                 #binfile = self.cache_path+ FileUtility.getfilename(donefile)
                 #if FileUtility.getfilesize(donefile) == FileUtility.getfilesize(binfile):
                 ## back up today's puc files
                 #Logger.getlogging().info('MOVE {file} TO {path}'.format(file=donefile,path=self.pucbacktoday))
                 #FileUtility.move(donefile, self.pucbacktoday)
                 #if FileUtility.exists(donefile):
                 #Logger.getlogging().error('MOVE {file} failed'.format(file=donefile))
                 #else:
                 #Logger.getlogging().error('File not equal {file}'.format(file=donefile))
                 jsonfile = self.bin2json(binfile)
                 files.append(jsonfile)
                 try:
                     # Dump the URLs, then remove both the json and the
                     # moved done file once processed.
                     self.s3puc_dumpurls(jsonfile)
                     time.sleep(0.5)
                     Logger.getlogging().debug(
                         'Remove {f}'.format(f=jsonfile))
                     FileUtility.remove(jsonfile)
                     donefile2 = os.path.join(
                         self.done_file, FileUtility.getfilename(donefile))
                     Logger.getlogging().debug(
                         'Remove {f}'.format(f=donefile2))
                     FileUtility.remove(donefile2)
                 except:
                     # On any processing error leave a placeholder json so
                     # downstream consumers do not block on a missing file.
                     Logger.printexception()
                     Logger.getlogging().error(
                         'no json file generate from done file:{done}'.
                         format(done=binfile))
                     os.mknod(jsonfile)
         except:
             # Best-effort per file: one bad file must not stop the batch.
             Logger.printexception()
     return files
Exemple #5
0
 def upload(self, path):
     """Track *path* for retransmission and push it over ssh.

     Sends an upload-failed notification and returns False when
     sshupload fails; returns True otherwise.
     """
     record = RetransInfo()
     record.filename = FileUtility.getfilename(path)
     record.start_time = int(time.time())
     self.uploadfile_retranslist[record.filename] = record
     self.upload_file_list[record.filename] = []
     if self.sshupload(path):
         return True
     param = NotifyParam()
     param.code = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED
     param.message = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED_MESSAGE_FORMAT.format(
         file=FileUtility.getfilename(path), taskid=self.info.ip)
     SpiderNotify.notify(param)
     return False
Exemple #6
0
 def retrydownload(self, jsonfile, urlset):
     """Re-queue URLs that failed to download, unless retried twice already.

     *urlset* maps url -> remaining occurrence count; each url is
     re-queued that many times into a fresh URL file.
     """
     Logger.getlogging().warning(
         'upload failed urls {num}'.format(num=len(urlset)))
     context = URLFileManager.getinstance().geturlfilecontext(
         FileUtility.getfilename(jsonfile))
     if context.retry >= 2:
         # Give up after two retries; just log every failed URL.
         Logger.getlogging().error('do not upload for failed again')
         for url in urlset.keys():
             Logger.getlogging().error(
                 'download {url} failed'.format(url=url))
         return
     urls = []
     for url in urlset.keys():
         Logger.getlogging().warning(
             'retry download {url}'.format(url=url))
         urls.extend([url] * urlset[url])
     newurlfile = URLFileManager.getinstance().generateurlfilepath(
         context.retry + 1)
     Logger.getlogging().warning(
         'Retry download URL {file}'.format(file=newurlfile))
     # Preserve the request type implied by the json file's suffix.
     if constant.POST_FILE_SUFFIX in jsonfile:
         requesttype = constant.REQUEST_TYPE_POST
     elif constant.WEBKIT_FILE_SUFFIX in jsonfile:
         requesttype = constant.REQUEST_TYPE_WEBKIT
     else:
         requesttype = constant.REQUEST_TYPE_COMMON
     URLManager.getinstance().storeurls(urls, requesttype)
Exemple #7
0
 def s2query(self):
     """Load the S2 query temp file and run each non-empty query on every site.

     Afterwards registers the query list and the sites that produced S2
     results with SpiderReport.
     """
     self.conf.setchannel(SPIDER_CHANNEL_S2)
     s2file = SpiderConfigure.getinstance().gets2file()
     file = FileUtility.getfilename(s2file)
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
     if FileUtility.exists(s2temppath):
         with open(s2temppath, 'r') as fp:
             querylist = []
             firstline = True
             for strquery in fp.readlines():
                 if firstline:
                     firstline = False
                     # Strip a UTF-8 BOM from the first line if present.
                     # NOTE(review): comparing a str slice to
                     # codecs.BOM_UTF8 (bytes) only matches on Python 2
                     # -- confirm the intended runtime.
                     if strquery[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                         strquery = strquery[3:]
                 strquery = Common.strip(strquery)
                 if not strquery:
                     continue
                 Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                 self.conf.setquery(strquery)
                 URLStorage.updaterecycle()
                 querylist.append(strquery)
                 # '&' is replaced by a space before querying each site.
                 for site in self.factory.getall():
                     site.s2query(strquery.replace('&', ' '))
             # Report only the sites that actually produced S2 results.
             sitelist = []
             for site in self.factory.getall():
                 if site.exists2():
                     sitelist.append(site)
             SpiderReport.loadquery(querylist)
             SpiderReport.loadsites(sitelist)
Exemple #8
0
 def renewfilename(self, file):
     """Rename a URL file to carry a fresh timestamp and update its context.

     Returns the new file path, or False when no context is registered
     for *file*.
     """
     filename = FileUtility.getfilename(file)
     context = URLFileManager.getinstance().geturlfilecontext(filename)
     if not context:
         return False
     # Wait out the current second so the new timestamp cannot collide
     # with the previous one.
     if self.filetime == int(time.time()):
         time.sleep(1)
     self.filetime = int(time.time())
     # Replace the trailing run of digits (the old timestamp) in the name.
     # Fix: use a raw string for the regex -- '\d' is an invalid string
     # escape (DeprecationWarning, SyntaxWarning/error on modern Python).
     newfilename = filename.replace(
         re.findall(r'\d+', filename)[-1], str(self.filetime))
     urlsfile = self.tempurlpath + newfilename
     context.filename = urlsfile
     URLFileManager.getinstance().updateurlfilecontext(
         FileUtility.getfilename(urlsfile), context)
     return urlsfile
Exemple #9
0
 def __upload__(self, filepath):
     """Back up *filepath* and dispatch it to a downloader implementation.

     Webkit-suffixed files go to the webkit pool (self.wimpls), all
     other files to the default pool (self.impls), each rotated
     round-robin.  Returns False when no platform is available.
     """
     flag = True
     FileUtility.mkdirs(self.urlbackuppath)
     FileUtility.copy(filepath, self.urlbackuppath)
     self.upload_file_list[FileUtility.getfilename(filepath)] = []
     # if filepath.endswith(constant.POST_FILE_SUFFIX) or FileUtility.getfilelines(filepath) <= constant.REMOTE_DOWNLOADER_MIN_LINES:
     #     if self.limpls:
     #         if self.limplsindex >= len(self.limpls):
     #             self.limplsindex = 0
     #         flag = self.limpls[self.limplsindex].upload(filepath)
     #         self.limplsindex += 1
     if filepath.endswith(constant.WEBKIT_FILE_SUFFIX):
         if self.wimpls:
             # Round-robin cursor over the webkit downloaders
             # ("wimplsindoex" is the pre-existing attribute name,
             # typo included -- renaming would break other call sites).
             if self.wimplsindoex >= len(self.wimpls):
                 self.wimplsindoex = 0
             # NOTE(review): this branch ignores the upload result, so
             # flag stays True even if the webkit upload fails --
             # confirm that is intended.
             self.wimpls[self.wimplsindoex].upload(filepath)
             self.wimplsindoex += 1
     elif self.impls:
         if self.implsindex >= len(self.impls):
             self.implsindex = 0
         flag = self.impls[self.implsindex].upload(filepath)
         self.implsindex += 1
     else:
         flag = False
         Logger.getlogging().warning('No taskid or download platform!')
     return flag
Exemple #10
0
 def bin2json(self, file):
     """Convert a ".done" binary file to json via the external parse tool.

     Returns the path of the resulting json file.
     """
     jsonname = FileUtility.getfilename(file).replace('.done', '.json')
     command = PUCDownloader.PARSE_COMMAND.format(command=self.parse_tool,
                                                  input=file,
                                                  output=self.json_path,
                                                  filename=jsonname)
     self.execute(command)
     return self.json_path + jsonname
Exemple #11
0
 def getqueryfromdb(self):
     """Dump local S2 queries from the database into a temp file.

     Returns the temp file path when the dump produced a file,
     otherwise None.
     """
     # Target path: S2 query temp dir + the configured s2 file's name.
     filename = FileUtility.getfilename(
         SpiderConfigure.getinstance().gets2file())
     temppath = Storage.getstoragelocation(
         const.SPIDER_QUERY_TEMP_PATH) + filename
     QueryStorage.getinstance().getlocalquerys(
         temppath, ETLController.LOCALMACHINEFLAG)
     return temppath if FileUtility.exists(temppath) else None
Exemple #12
0
 def gettiebaqueryfromdb(self):
     """Dump local tieba queries from the database into a temp file.

     Returns the temp file path when the dump produced a file,
     otherwise None.
     """
     # Target path: tieba temp dir + the configured S3 input file's name.
     filename = FileUtility.getfilename(
         SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                   const.SPIDER_S3_INPUT_FILE))
     temppath = Storage.getstoragelocation(
         const.SPIDER_TIEBA_TEMP_PATH) + filename
     QueryStorage.getinstance().getlocalquerys_tieba(
         temppath, ETLController.LOCALMACHINEFLAG)
     return temppath if FileUtility.exists(temppath) else None
 def upload(self, path):
     """Upload via the parent PostDownloader, then drop a local ".done"
     copy into the download path so the download step can pick it up.

     Fix: the copy template contained a literal "(unknown)" where the
     {filename} placeholder belongs (the filename= argument was passed
     but never used); the download matcher expects done-file names to
     start with the uploaded file's name.
     """
     tencentplatform.postdownloader.PostDownloader.upload(self, path)
     filename = FileUtility.getfilename(path)
     FileUtility.mkdirs(self.download_path)
     FileUtility.copy(
         path,
         '{dir}/{filename}.txt.{ts}.done'.format(dir=self.download_path,
                                                 filename=filename,
                                                 ts=int(time.time())))
     return True
Exemple #14
0
    def waibuetl(self):
        """Run one external (waibu) ETL round.

        Uploads the query file produced by getqueryfromdb, polls the
        downloader for result files, analyzes each one and moves it to
        the backup directory.  Gives up once the accumulated wait
        exceeds self.waibutimeout.
        """
        waibubackup = SpiderConfigure.getwaibubaup()
        if not FileUtility.exists(waibubackup):
            FileUtility.mkdirs(waibubackup)

        waibufile = self.etl.getqueryfromdb()
        if not FileUtility.exists(waibufile):
            Logger.getlogging().warning(
                '{waibufile} not generate!'.format(waibufile=waibufile))
            return

        outtime = 0
        self.wdownloader.upload(waibufile)
        continueflag = True
        while continueflag:
            downloadfiles = []
            while True:
                Logger.getlogging().info(
                    'sleeping {sec}s......'.format(sec=self.waitingperiod))
                # NOTE(review): the sleep below is commented out, so this
                # inner loop polls without delay while outtime still
                # advances by waitingperiod per pass -- confirm intended.
                #time.sleep(self.waitingperiod)
                outtime += self.waitingperiod
                if self.wdownloader.iscompleted():
                    continueflag = False
                    break
                try:
                    downloadfiles = self.wdownloader.download()
                    if downloadfiles:
                        break
                except:
                    Logger.printexception()
                if outtime >= self.waibutimeout:
                    Logger.getlogging().warning(
                        'Waibu Data Download Timeout! Spending {sec}s'.format(
                            sec=outtime))
                    continueflag = False
                    break
            for dfile in downloadfiles:
                starttime = TimeUtility.getcurrentdate(
                    TimeUtility.TIME_FORMAT_DEFAULT)
                self.etl.wb_analysis(dfile)
                #if FileUtility.exists(waibubackup+FileUtility.getfilename(dfile)):
                #FileUtility.remove(waibubackup+FileUtility.getfilename(dfile))
                FileUtility.move(dfile, waibubackup)
                logstring = 'PROCESSWAIBUFILE:\t{file}\t{start}\t{end}'.format(
                    file=FileUtility.getfilename(dfile),
                    start=starttime,
                    end=TimeUtility.getcurrentdate())
                Logger.getlogging().info(logstring)
                # Stop processing further files once the timeout is hit.
                if outtime >= self.waibutimeout:
                    Logger.getlogging().warning(
                        'Waibu Data Download Timeout! Spending {sec}s'.format(
                            sec=outtime))
                    continueflag = False
                    break
 def bin2json(self, file):
     """Parse a ".done" file into json, using the image parse tool for
     image done-files and the default tool otherwise.

     Returns the path of the resulting json file.
     """
     jsonname = FileUtility.getfilename(file).replace('.done', '.json')
     # Image done-files need the dedicated image parser.
     parser = self.parse_tool_img if constant.IMG_FILE_SUFFIX in jsonname \
         else self.parse_tool
     command = TencentDownloader.PARSE_COMMAND.format(command=parser,
                                                      input=file,
                                                      output=self.json_path,
                                                      filename=jsonname)
     self.execute(command)
     return self.json_path + jsonname
Exemple #16
0
 def preprocess(self, filepath):
     """Restore channel/query configuration from the url-file context.

     Returns True when a context exists for *filepath*, else False.
     """
     context = URLStorage.getfilecontext(FileUtility.getfilename(filepath))
     if not context:
         return False
     self.conf.setchannel(context.channel)
     # Only the S2 channel carries a query string.
     query = context.query if context.channel == SPIDER_CHANNEL_S2 else ''
     self.conf.setquery(query)
     URLStorage.updaterecycle()
     return True
Exemple #17
0
 def preprocess(self, filepath):
     """Restore channel/query configuration from the url-file context.

     Returns True when a context exists for *filepath*, else False.
     """
     context = URLFileManager.getinstance().geturlfilecontext(
         FileUtility.getfilename(filepath))
     if not context:
         return False
     self.conf.setchannel(context.channel)
     # Only the S2 channel carries a query string.
     query = context.query if context.channel == SPIDER_CHANNEL_S2 else ''
     self.conf.setquery(query)
     URLFileManager.getinstance().generateurlfilepath()
     return True
 def upload(self, path):
     """Upload via the parent TencentDownloader, then drop a local ".done"
     copy into the download path so the download step can pick it up.

     Fixes: the copy/log templates contained a literal "(unknown)" where
     the {filename} placeholder belongs (the filename= argument was
     passed but never used), and the local `ts` was assigned but unused
     while time.time() was re-read for each format call -- use one
     timestamp so the logged name matches the copied name.
     """
     tencentplatform.tencentdownloader.TencentDownloader.upload(self, path)
     filename = FileUtility.getfilename(path)
     ts = int(time.time())
     FileUtility.mkdirs(self.download_path)
     target = '{dir}/{filename}.txt.{ts}.done'.format(dir=self.download_path,
                                                     filename=filename,
                                                     ts=ts)
     Logger.getlogging().debug(path + '--->' + target)
     FileUtility.copy(path, target)
     return True
Exemple #19
0
 def backupfile(self, jsonfile):
     """Load the backup URL list for *jsonfile* as a url -> count map.

     The backup file name is the json file's name truncated at the
     split marker ("_split" for split chunks, "." otherwise).
     """
     counts = {}
     marker = '_split' if '_split' in jsonfile else '.'
     bkfile = self.urlbackuppath + '/' + \
         FileUtility.getfilename(jsonfile).split(marker)[0]
     if FileUtility.exists(bkfile):
         with open(bkfile, 'r') as bkfh:
             for line in bkfh.readlines():
                 url = line.strip()
                 counts[url] = counts.get(url, 0) + 1
     return counts
Exemple #20
0
 def upload(self, upfiles):
     """Upload each file in *upfiles*, removing it afterwards.

     Empty files are deleted without uploading.  Returns False as soon
     as one upload fails, True when every file was handled.
     """
     Logger.getlogging().debug('uploading ......')
     for upfile in upfiles:
         if self.emptyfile(upfile):
             Logger.getlogging().info('remove empty file: ' + upfile)
             FileUtility.remove(upfile)
             continue
         if not self.__upload__(upfile):
             Logger.log(FileUtility.getfilename(upfile),
                        constant.ERRORCODE_FAIL_LOAD_UP)
             return False
         Logger.getlogging().info('remove uploadedfile' + upfile)
         FileUtility.remove(upfile)
     time.sleep(1)
     return True
Exemple #21
0
    def retrans(self):
        """Retransmit files whose download failed.

        Each failed file is recovered from backup into temp/urls, its
        retransmission record is re-registered under the new file name,
        and downloader implementations whose task is no longer alive
        are removed from their pool -- unless they are the last one
        left, in which case only a warning is logged.
        """
        if len(self.retransmissionfiles) > 0:
            # Fix: materialize the keys first -- the loop pops entries
            # from self.retransmissionfiles, and mutating a dict while
            # iterating it raises RuntimeError on Python 3.
            for fl in list(self.retransmissionfiles.keys()):
                newfl = self.recoverfile(fl)
                newfilename = FileUtility.getfilename(newfl)
                self.all_retransmissionfiles[
                    newfilename] = self.all_retransmissionfiles[fl]
                Logger.getlogging().debug(
                    'download fail and transimission file {fl}:{num}th'.format(
                        fl=newfilename,
                        num=self.all_retransmissionfiles[newfilename].
                        retrans_num))

                impl = self.all_retransmissionfiles[newfilename].taskinfo
                if not impl.taskstatusflag:
                    # Drop the dead implementation from whichever pool
                    # holds it, but never empty a pool completely.
                    if impl in self.wimpls:
                        if len(self.wimpls) != 1:
                            self.wimpls.remove(impl)
                            Logger.getlogging().warning(
                                'download fail and delete taskid is :{impl}'.
                                format(impl=impl.taskinfo.taskid))
                        else:
                            Logger.getlogging().warning(
                                'only one taskid is :{impl}'.format(
                                    impl=impl.taskinfo.taskid))
                    if impl in self.impls:
                        if len(self.impls) != 1:
                            self.impls.remove(impl)
                            Logger.getlogging().warning(
                                'download fail and delete taskid is :{impl}'.
                                format(impl=impl.taskinfo.taskid))
                        else:
                            Logger.getlogging().warning(
                                'only one taskid is :{impl}'.format(
                                    impl=impl.taskinfo.taskid))
                    if impl in self.limpls:
                        if len(self.limpls) != 1:
                            self.limpls.remove(impl)
                            Logger.getlogging().warning(
                                'download fail and delete downloadplatform is :{impl}'
                                .format(impl=impl.info.ip))
                        else:
                            Logger.getlogging().warning(
                                'only one downloadplatform is :{impl}'.format(
                                    impl=impl.info.ip))
                self.retransmissionfiles.pop(fl)
Exemple #22
0
    def loop(self):
        """Main crawl loop: repeatedly download result files, process
        them through the ETL, and re-upload, until the downloader
        reports completion or the run-time limit is hit.
        """
        # Loop over URLs, covering both S1 and S2 channels.
        continueflag = True
        while continueflag:
            downloadfiles = []
            while True:
                # check time out
                if self.istimeout():
                    param = NotifyParam()
                    param.code = NotifyParam.SPIDER_NOTIFY_TIMEOUT
                    param.message = 'Spider timeout for %s o\'clock, stop' % constant.SPIDER_RUN_TIMEOUT_HOUR
                    SpiderNotify.notify(param)
                    continueflag = False
                    break
                if self.downloader.iscompleted():
                    continueflag = False
                    break
                try:
                    downloadfiles = self.downloader.download()
                    self.upload()
                    if len(downloadfiles) > 0:
                        break
                    else:
                        # Nothing ready yet: wait before polling again.
                        Logger.getlogging().info('sleeping {0}s......'.format(
                            self.waitingperiod))
                        time.sleep(self.waitingperiod)
                except:
                    Logger.printexception()

            for dfile in downloadfiles:
                starttime = TimeUtility.getcurrentdate(
                    TimeUtility.TIME_FORMAT_DEFAULT)
                self.etl.processfile(dfile)
                logstring = 'PROCESSFILE:\t{file}\t{start}\t{end}'.format(
                    file=FileUtility.getfilename(dfile),
                    start=starttime,
                    end=TimeUtility.getcurrentdate())
                Logger.getlogging().info(logstring)
                # Re-check the deadline between files: processing a file
                # can take long enough to cross it.
                if self.istimeout():
                    param = NotifyParam()
                    param.code = NotifyParam.SPIDER_NOTIFY_TIMEOUT
                    param.message = 'Spider timeout for %s o\'clock, stop' % constant.SPIDER_RUN_TIMEOUT_HOUR
                    SpiderNotify.notify(param)
                    continueflag = False
                    break
                self.upload()
Exemple #23
0
 def processfile(self, jsonfile):
     """Process every line of a downloaded json result file.

     Each line is parsed into a request param, routed to the matching
     site processor, and checked off against the backup URL counts;
     whatever remains in *urls* afterwards failed and is re-queued via
     retrydownload.
     """
     if not self.preprocess(jsonfile):
         return
     # POST results embed their request data in the line.
     post = (constant.POST_FILE_SUFFIX in jsonfile)
     urls = self.backupfile(jsonfile)
     context = URLStorage.getfilecontext(FileUtility.getfilename(jsonfile))
     with open(jsonfile, 'r') as fp:
         lines = fp.readlines()
     for line in lines:
         param = self.analysis(line, post)
         if param is None:
             continue
         url = param.url
         # After two retries this pass is the last attempt.
         if context.retry >= 2:
             param.lastretry = True
         if post:
             # Backup keys for POST requests are the url+data json blob.
             url = json.dumps({'url': param.url, 'data': param.data})
         else:
             Logger.getlogging().warning(url)
         info = None
         if URLStorage.hasurl(url):
             # Known URL: restore its stored crawl context.
             info = URLStorage.geturlcontext(url)
             param.originalurl = info.originalurl
             param.step = info.step
             param.customized = info.customized
         else:
             param.originalurl = param.url
         res = True
         if SiteS2Query.REFER_URL in param.customized:
             site = self.factory.getsite(param.customized[SiteS2Query.REFER_URL])
             res = site.process(param)
         else:
             site = self.factory.getsite(param.originalurl)
             res = site.process(param)
         if not res:
             # Processing failed: keep the context for the retry pass.
             if info:
                 URLStorage.seturlcontext(param.url, info)
         else:
             # Success: decrement the outstanding count for this URL.
             if url in urls:
                 urls[url] -= 1
                 if urls[url] == 0:
                     urls.pop(url)
     # upload failed urls
     if urls:
         self.retrydownload(jsonfile, urls)
Exemple #24
0
 def processfile(self, jsonfile):
     """Process every line of a downloaded json result file.

     Each line is parsed into a request param, routed to the matching
     site processor, and checked off against the backup URL counts;
     whatever remains in *urls* afterwards failed and is re-queued via
     retrydownload.
     """
     if not self.preprocess(jsonfile):
         return
     method = self.requesttype(jsonfile)
     urls = self.backupfile(jsonfile)
     context = URLFileManager.getinstance().geturlfilecontext(
         FileUtility.getfilename(jsonfile))
     with open(jsonfile, 'r') as fp:
         lines = fp.readlines()
     for line in lines:
         param = self.analysis(line, method)
         if param is None:
             continue
         url = param.url
         # After two retries this pass is the last attempt.
         if context.retry >= 2:
             param.lastretry = True
         if method == constant.REQUEST_TYPE_POST:
             # Backup keys for POST requests are the url+data json blob.
             url = json.dumps({'url': param.url, 'data': param.data})
         info = None
         if URLManager.getinstance().exist(url):
             # Known URL: restore its stored crawl context.
             info = URLManager.getinstance().geturlcontext(url)
             param.originalurl = info.originalurl
             param.step = info.step
             param.type = info.type
             param.customized = info.customized
         else:
             param.originalurl = param.url
             param.type = URLContext.S1_MAIN_BODY
         if SiteS2Query.REFER_URL in param.customized:
             site = self.factory.getsite(
                 param.customized[SiteS2Query.REFER_URL])
         else:
             site = self.factory.getsite(param.originalurl)
         res = site.process(param)
         if not res:
             # Processing failed: keep the context for the retry pass.
             if info:
                 URLManager.getinstance().seturlcontext(param.url, info)
         else:
             # Success: decrement the outstanding count for this URL.
             if url in urls:
                 urls[url] -= 1
                 if urls[url] == 0:
                     urls.pop(url)
     # upload failed urls
     if urls:
         self.retrydownload(jsonfile, urls)
Exemple #25
0
 def generateurlfilepath(self, retrytimes=0):
     """Create a fresh URLs file path and register its context.

     Waiting out the current second guarantees the timestamp (and
     therefore the file name) differs from the previous one.
     """
     # Ensure a unique timestamp before building the name.
     if int(time.time()) == self.urlfiletimestamp:
         time.sleep(1)
     self.urlfiletimestamp = int(time.time())
     ctx = URLFileContext()
     ctx.channel = SpiderConfigure.getinstance().getchannel()
     ctx.query = SpiderConfigure.getinstance().getquery()
     ctx.retry = retrytimes
     self.urlsfile = URLFileManager.URLS_FILE_PATTERN.format(
         path=self.tempurldir,
         channel=ctx.channel,
         query=Common.md5(ctx.query),
         ts=self.urlfiletimestamp)
     ctx.filename = self.urlsfile
     self.urlsfilemap[FileUtility.getfilename(self.urlsfile)] = ctx
     Logger.getlogging().info(self.urlsfile)
     return self.urlsfile
Exemple #26
0
 def retrydownload(self, jsonfile, urlset):
     """Re-queue URLs that failed to download, unless retried twice already.

     *urlset* maps url -> remaining occurrence count; each url is
     re-queued that many times.
     """
     Logger.getlogging().warning('upload failed urls {num}'.format(num=len(urlset)))
     context = URLStorage.getfilecontext(FileUtility.getfilename(jsonfile))
     if context.retry >= 2:
         # Give up after two retries; just log every failed URL.
         Logger.getlogging().error('do not upload for failed again')
         for url in urlset.keys():
             Logger.getlogging().error('download {url} failed'.format(url=url))
         return
     urls = []
     for url in urlset.keys():
         Logger.getlogging().warning('retry download {url}'.format(url=url))
         urls.extend([url] * urlset[url])
     # Roll back the statistics for the re-queued URLs.
     StatisticsManager.updateall(-len(urls))
     URLStorage.updaterecycle(context.retry + 1)
     # Preserve the request type implied by the json file's suffix.
     if constant.POST_FILE_SUFFIX in jsonfile:
         requesttype = constant.REQUEST_TYPE_POST
     elif constant.WEBKIT_FILE_SUFFIX in jsonfile:
         requesttype = constant.REQUEST_TYPE_WEBKIT
     else:
         requesttype = constant.REQUEST_TYPE_COMMON
     URLStorage.storeurls(urls, requesttype)
 def __download__(self):
     """Run the platform download command, retrying with exponential backoff.

     Returns True on success; after all retries fail, notifies and
     returns False.  On Windows debug runs the command is only logged.
     """
     cmd = self.DOWNLOADCMD.format(path=self.download_path,
                                   appId=self.appid,
                                   jobId=self.jobid,
                                   token=self.token)
     if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
         Logger.getlogging().debug(cmd)
         return True
     if self.execute2(cmd):
         return True
     # Retry with exponentially growing delays: 5s, 10s, 20s, ...
     delay = 5
     for _ in range(self.RETRYTIMES):
         time.sleep(delay)
         delay *= 2
         if self.execute2(cmd):
             return True
     # Every attempt failed: notify and report failure.
     param = NotifyParam()
     param.code = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED
     param.message = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED_MESSAGE_FORMAT.format(
         file=FileUtility.getfilename(self.download_path),
         taskid=self.jobid)
     SpiderNotify.notify(param)
     return False
Exemple #28
0
 def initialize(self):
     """Seed download_file_list with everything already in the remote
     done path so pre-existing files are not treated as new downloads."""
     existing = ssh.sshls(self.info.ip, self.info.port, self.info.username,
                          self.info.password, self.info.donepath)
     for entry in existing:
         self.download_file_list.append(
             FileUtility.getfilename(entry.strip()))
Exemple #29
0
 def bin2json(self, file):
     """Copy *file* into the json directory under a ".json" name.

     No parsing happens here -- the done file is taken as-is.  Returns
     the destination path.
     """
     target = self.info.jsonpath + \
         FileUtility.getfilename(file).replace('.done', '.json')
     FileUtility.copy(file, target)
     return target
 def download(self):
     """
     On the platform the download is two steps, while a direct Windows
     data request only performs step2:download():
     step1: download data from the platform to local ./data/platform
     step2: copy data from ./data/platform into ./data/temp/done, then
     store the parsed json data under ./data/temp/json
     Returns the list of json file paths produced this round.
     """
     files = []
     if self.completed():
         return files
     Logger.getlogging().debug(self.download_path)
     srclist = FileUtility.getfilelist(self.download_path, [])
     for donefile in srclist:
         filename = FileUtility.getfilename(donefile)
         if donefile.endswith(
                 'done') and filename not in self.download_file_list:
             self.download_file_list.append(filename)
             self.download_time = time.time()
             # Match the done file back to the upload that produced it.
             for upfile in self.upload_file_list.keys():
                 if filename.startswith(upfile):
                     FileUtility.copy(donefile, self.cache_path)
                     binfile = self.cache_path + FileUtility.getfilename(
                         donefile)
                     # Only delete the source once the cached copy has
                     # the same size (copy verified).
                     if FileUtility.getfilesize(
                             donefile) == FileUtility.getfilesize(binfile):
                         Logger.getlogging().info(
                             'Remove {file}'.format(file=donefile))
                         FileUtility.remove(donefile)
                         if FileUtility.exists(donefile):
                             Logger.getlogging().error(
                                 'Remove {file} failed'.format(
                                     file=donefile))
                     else:
                         Logger.getlogging().error(
                             'File not equal {file}'.format(file=donefile))
                     jsonfile = self.bin2json(binfile)
                     files.append(jsonfile)
                     uploadtime = self.uploadfile_retranslist[
                         upfile].start_time
                     # FORMAT1: single-part result -- upload fully
                     # accounted for, stop tracking it.
                     if RegexUtility.match(
                             TencentDownloader.DOWNLOAD_FORMAT1.format(
                                 file=upfile), filename):
                         self.upload_file_list.pop(upfile)
                         self.uploadfile_retranslist.pop(upfile)
                     # FORMAT2: multi-part result -- only stop tracking
                     # when this is the final part (part == total).
                     elif RegexUtility.match(
                             TencentDownloader.DOWNLOAD_FORMAT2.format(
                                 file=upfile), filename):
                         value = \
                         RegexUtility.parse(TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile), filename)[0]
                         if value[0] == value[1]:
                             self.upload_file_list.pop(upfile)
                             self.uploadfile_retranslist.pop(upfile)
                     if not FileUtility.exists(jsonfile):
                         # Leave a placeholder so downstream consumers
                         # do not block on a missing file.
                         Logger.getlogging().error(
                             'no json file generate from done file:{done}'.
                             format(done=binfile))
                         os.mknod(jsonfile)
                     # update upload time
                     keys = self.sortkeys()
                     for fl in keys:
                         if self.uploadfile_retranslist[
                                 fl].start_time >= uploadtime:
                             self.uploadfile_retranslist[
                                 fl].start_time = time.time()
                             time.sleep(0.1)
                     break
     return files