コード例 #1
0
 def storeurls(self, urls, request=constant.REQUEST_TYPE_COMMON):
     """Append a batch of URLs to the current URL file, rolling over to a
     fresh file first when the batch would push it past the line limit."""
     manager = URLFileManager.getinstance()
     urlfile = manager.geturlfilepath(request)
     projected = FileUtility.geturlfilelines(urlfile) + len(urls)
     if projected > URLFileManager.URL_FILE_LINES_MAX_NUMBER:
         manager.generateurlfilepath()
         urlfile = manager.geturlfilepath(request)
     FileUtility.writelines(urlfile, urls)
コード例 #2
0
 def s2query(self):
     """Run S2 (keyword-search) queries: read the query list from the temp
     copy of the configured s2 file, fire each query at every site, then
     load the collected query/site lists into the report."""
     self.conf.setchannel(SPIDER_CHANNEL_S2)
     s2file = SpiderConfigure.getinstance().gets2file()
     file = FileUtility.getfilename(s2file)
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
     if FileUtility.exists(s2temppath):
         with open(s2temppath, 'r') as fp:
             querylist = []
             firstline = True
             for strquery in fp.readlines():
                 if firstline:
                     firstline = False
                     # Strip a UTF-8 BOM from the first line, if present.
                     # NOTE(review): comparing a str slice against
                     # codecs.BOM_UTF8 assumes byte-string I/O (Python 2
                     # style) -- confirm behavior on Python 3.
                     if strquery[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                         strquery = strquery[3:]
                 strquery = Common.strip(strquery)
                 if not strquery:
                     # Skip blank lines.
                     continue
                 Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                 self.conf.setquery(strquery)
                 URLStorage.updaterecycle()
                 querylist.append(strquery)
                 for site in self.factory.getall():
                     # '&' is replaced with a space before querying each site.
                     site.s2query(strquery.replace('&', ' '))
             # Collect only the sites that actually support/ran S2.
             sitelist = []
             for site in self.factory.getall():
                 if site.exists2():
                     sitelist.append(site)
             SpiderReport.loadquery(querylist)
             SpiderReport.loadsites(sitelist)
コード例 #3
0
 def __upload__(self, filepath):
     """Dispatch *filepath* to an uploader pool, round-robin style.

     Webkit URL files go to the webkit pool (self.wimpls); everything else
     goes to the default pool (self.impls). Returns False when no pool is
     available, otherwise the upload result (a webkit dispatch leaves
     flag as True regardless of its own outcome).
     """
     flag = True
     # Keep a backup copy of the URL file before any upload attempt.
     FileUtility.mkdirs(self.urlbackuppath)
     FileUtility.copy(filepath, self.urlbackuppath)
     self.upload_file_list[FileUtility.getfilename(filepath)] = []
     # if filepath.endswith(constant.POST_FILE_SUFFIX) or FileUtility.getfilelines(filepath) <= constant.REMOTE_DOWNLOADER_MIN_LINES:
     #     if self.limpls:
     #         if self.limplsindex >= len(self.limpls):
     #             self.limplsindex = 0
     #         flag = self.limpls[self.limplsindex].upload(filepath)
     #         self.limplsindex += 1
     if filepath.endswith(constant.WEBKIT_FILE_SUFFIX):
         if self.wimpls:
             # NOTE(review): 'wimplsindoex' reads like a typo of
             # 'wimplsindex'; it is used consistently here, so renaming
             # would have to include wherever the attribute is initialized.
             if self.wimplsindoex >= len(self.wimpls):
                 self.wimplsindoex = 0
             # NOTE(review): the webkit upload's return value is discarded,
             # so flag stays True even if this upload fails -- confirm.
             self.wimpls[self.wimplsindoex].upload(filepath)
             self.wimplsindoex += 1
     elif self.impls:
         if self.implsindex >= len(self.impls):
             self.implsindex = 0
         flag = self.impls[self.implsindex].upload(filepath)
         self.implsindex += 1
     else:
         flag = False
         Logger.getlogging().warning('No taskid or download platform!')
     return flag
コード例 #4
0
def download(urlfilepath):
    """Dispatch one URL file to the matching local downloader (webkit /
    post / plain GET). Results go to a temp file that is renamed to a
    .done file on completion; the source URL file is removed afterwards."""
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    filename = os.path.basename(urlfilepath)
    tmpfile = os.path.join(donepath, filename + '.temp')
    donefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    # Clear stale outputs left by a previous (possibly aborted) run.
    for stale in (tmpfile, donefile):
        if os.path.exists(stale):
            os.remove(stale)
    # On Windows debug runs, any https URL forces the webkit downloader.
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        for line in FileUtility.readlines(urlfilepath):
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # Create an empty temp file so downstream appends always have a target.
    with open(tmpfile, 'a+') as filetemp:
        filetemp.write('')
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, tmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, tmpfile)
    else:
        downGet(urlfilepath, tmpfile)
    if os.path.exists(tmpfile):
        os.rename(tmpfile, donefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=donefile))
    FileUtility.remove(urlfilepath)
コード例 #5
0
 def upload(self, path):
     """Upload *path* via the platform upload command, retrying with
     exponential backoff; notify on final failure.

     Returns True on a successful execute, False after all retries fail.
     """
     retans = RetransInfo()
     retans.filename = FileUtility.getfilename(path)
     # Never let two consecutive uploads share the same start second.
     if int(self.start_time) == int(time.time()):
         time.sleep(0.1)
     self.start_time = time.time()
     retans.start_time = self.start_time
     self.uploadfile_retranslist[retans.filename] = retans
     self.upload_file_list[FileUtility.getfilename(path)] = []
     cmd = TencentDownloader.UPLOAD_COMMAND.format(
         file=path,
         url=self.upload_url,
         user_id=self.taskinfo.userid,
         task_name=self.taskinfo.taskname,
         task_id=self.taskinfo.taskid)
     # One immediate attempt plus self.retrytimes retries, sleeping
     # 10s, 20s, 40s, ... before each retry.
     delay = 10
     for attempt in range(1 + self.retrytimes):
         if attempt:
             time.sleep(delay)
             delay *= 2
         if self.execute(cmd):
             return True
     param = NotifyParam()
     param.code = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED
     param.message = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED_MESSAGE_FORMAT.format(
         file=FileUtility.getfilename(path),
         taskid=self.taskinfo.taskid)
     SpiderNotify.notify(param)
     return False
コード例 #6
0
def readFile(urlpath, filename):
    """Replay every JSON line in *urlpath* as a POST request via downPost(),
    appending each response to a temp file that becomes a .done file; the
    source file is removed when finished."""
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                       const.SPIDER_POST_WHOAMI)
    donepath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    tmpfile = donepath + filename + '.tmp'
    stamp = str(time.time()).split('.')[0]
    donefile = donepath + filename + '.txt.' + stamp + '.done'
    # Clear leftovers from any earlier, interrupted run.
    for stale in (tmpfile, donefile):
        if os.path.exists(stale):
            os.remove(stale)
    Logger.getlogging().debug('post_done start:{f}'.format(f=donefile))
    with open(urlpath, 'r') as fp:
        lines = fp.readlines()
        os.mknod(tmpfile)
        for line in lines:
            jsonLine = json.loads(line)
            try:
                jsonStr = downPost(jsonLine)
                with open(tmpfile, 'a+') as filetemp:
                    filetemp.write(jsonStr + '\n')
                Logger.getlogging().debug(
                    '{url}:Post request sucessed'.format(url=jsonLine['url']))
            except:
                # Best-effort per line: log and continue with the next URL.
                Logger.getlogging().warning(
                    '{url}:Post request failed'.format(url=jsonLine['url']))
                Logger.printexception()
    if os.path.exists(tmpfile):
        os.rename(tmpfile, donefile)
        Logger.getlogging().debug('post_done end:{f}'.format(f=donefile))
    FileUtility.remove(urlpath)
コード例 #7
0
 def download(self):
     """Poll the remote done-path over SSH and fetch finished .done files.

     Each new done file that matches a previously uploaded file is pulled
     locally (and converted to json when a json path is configured).
     Returns the list of local file paths produced in this pass.
     """
     files = []
     if self.completed():
         return files
     Logger.getlogging().debug(self.info.donepath)
     srclist = self.sshls(self.info.donepath)
     for donefile in srclist:
         donefile = donefile.strip()
         filename = FileUtility.getfilename(donefile)
         # Only handle finished files we have not already downloaded.
         if donefile.endswith(
                 'done') and filename not in self.download_file_list:
             self.download_file_list.append(filename)
             for upfile in self.upload_file_list.keys():
                 if filename.startswith(upfile):
                     FileUtility.mkdirs(self.info.localdonepath)
                     self.sshdownload(donefile)
                     dfile = self.info.localdonepath + FileUtility.getfilename(
                         donefile)
                     if self.info.jsonpath:
                         dfile = self.bin2json(dfile)
                     files.append(dfile)
                     self.download_time = int(time.time())
                     # Done with this upload entry; drop the tracking state.
                     # NOTE(review): popping while iterating .keys() is only
                     # safe because the loop breaks immediately after (and
                     # on Python 2, keys() returns a list copy).
                     self.upload_file_list.pop(upfile)
                     self.uploadfile_retranslist.pop(upfile)
                     if not FileUtility.exists(dfile):
                         Logger.getlogging().error(
                             'no json file generate from done file:{done}'.
                             format(done=dfile))
                     break
     return files
コード例 #8
0
 def upload(self, path):
     """Register *path* for retransmission tracking and upload it locally.

     Returns True on success, False on failure. (The original fell off the
     end and returned None on failure; an explicit False preserves the
     truthiness callers see while making the contract clear.)
     """
     retans = RetransInfo()
     retans.filename = FileUtility.getfilename(path)
     retans.start_time = int(time.time())
     self.uploadfile_retranslist[retans.filename] = retans
     self.upload_file_list[retans.filename] = []
     return bool(self.localupload(path))
コード例 #9
0
 def getqueryfromdb(self):
     """Dump the local S2 queries from the DB into the temp query file and
     return its path, or None when nothing was produced."""
     # Target path for the S2 query output.
     s2file = SpiderConfigure.getinstance().gets2file()
     temppath = Storage.getstoragelocation(
         const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
     QueryStorage.getinstance().getlocalquerys(
         temppath, ETLController.LOCALMACHINEFLAG)
     return temppath if FileUtility.exists(temppath) else None
コード例 #10
0
 def upload(self, path):
     """Upload via the base PostDownloader, then drop a timestamped copy of
     the file into the local download path as a .done file for pickup."""
     tencentplatform.postdownloader.PostDownloader.upload(self, path)
     filename = FileUtility.getfilename(path)
     FileUtility.mkdirs(self.download_path)
     # Bug fix: the format string ignored the 'filename' argument and used
     # a literal '(unknown)' name, so every upload clobbered the same file.
     FileUtility.copy(
         path,
         '{dir}/{filename}.txt.{ts}.done'.format(dir=self.download_path,
                                                 filename=filename,
                                                 ts=int(time.time())))
     return True
コード例 #11
0
 def gettiebaqueryfromdb(self):
     """Dump tieba queries from the DB into the temp file and return its
     path, or None when the file was not produced."""
     # Target path for the tieba query output.
     tiebafile = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                           const.SPIDER_S3_INPUT_FILE)
     temppath = Storage.getstoragelocation(
         const.SPIDER_TIEBA_TEMP_PATH) + FileUtility.getfilename(tiebafile)
     QueryStorage.getinstance().getlocalquerys_tieba(
         temppath, ETLController.LOCALMACHINEFLAG)
     return temppath if FileUtility.exists(temppath) else None
コード例 #12
0
 def recoverfile(self, filename):
     """Restore *filename* from the URL backup directory into a freshly
     renamed temp URL file; return the new path, or False on failure."""
     backups = FileUtility.getfilelist(self.urlbackuppath, [])
     candidate = os.path.join(self.urlbackuppath, filename)
     if candidate not in backups:
         return False
     newfilepath = self.renewfilename(candidate)
     FileUtility.copy(candidate, newfilepath)
     # Give the copy a moment to land before checking for it.
     time.sleep(0.5)
     return newfilepath if FileUtility.exists(newfilepath) else False
コード例 #13
0
 def storeurl(self, url, urlcontext, request=constant.REQUEST_TYPE_COMMON):
     """Write a single URL to the current URL file (rolling to a new file
     when the line limit would be exceeded) and record its context keyed
     by the md5 of the stripped URL. Blank URLs are ignored."""
     stripped = url.strip()
     if not stripped:
         return
     manager = URLFileManager.getinstance()
     urlfile = manager.geturlfilepath(request)
     if FileUtility.geturlfilelines(urlfile) + 1 > URLFileManager.URL_FILE_LINES_MAX_NUMBER:
         manager.generateurlfilepath()
         urlfile = manager.geturlfilepath(request)
     FileUtility.writeline(urlfile, url)
     self.urlcontextdict.setdefault(Common.md5(stripped), []).append(urlcontext)
コード例 #14
0
def sshdownload(host, port, username, pwd, targetFilePath, localPath):
    """Fetch *targetFilePath* from a remote host over SSH into *localPath*,
    downloading to a .tmp name first and renaming once complete.

    Returns True on success, False when the connection fails."""
    Logger.getlogging().info('scp -P {port} {username}@{host}:{file} {path}'.format(port=port, username=username, host=host, file=targetFilePath, path=localPath))
    ssh = SSHConnection(host, port, username, pwd)
    if not ssh.connect():
        return False
    fileName = targetFilePath.split('/')[-1]
    tmpname = localPath + fileName + '.tmp'
    ssh.download(targetFilePath, tmpname)
    ssh.close()
    # Atomic-ish finish: only expose the file under its final name.
    FileUtility.move(tmpname, localPath + fileName)
    return True
コード例 #15
0
 def __download__(self, downloaderlist):
     """Collect finished json files from every downloader implementation.

     Matches each downloaded file against the tracked upload list (two
     name formats), removes completed entries, and maintains the
     retransmission bookkeeping for files that timed out. Returns the
     list of json files that actually exist on disk.
     """
     valid_json_files = []
     for impl in downloaderlist:
         json_files = impl.download()
         for dfile in json_files:
             for ufile in self.upload_file_list.keys():
                 if RegexUtility.match(
                         Downloader.DOWNLOAD_FORMAT1.format(file=ufile),
                         dfile):
                     self.upload_file_list.pop(ufile)
                     if FileUtility.exists(dfile):
                         valid_json_files.append(dfile)
                         Logger.getlogging().info('downloadedjsonfile\t' +
                                                  dfile)
                 elif RegexUtility.match(
                         Downloader.DOWNLOAD_FORMAT2.format(file=ufile),
                         dfile):
                     # FORMAT2 parses a value pair from the name --
                     # presumably (part index, part count); verify against
                     # Downloader.DOWNLOAD_FORMAT2.
                     value = RegexUtility.parse(
                         Downloader.DOWNLOAD_FORMAT2.format(file=ufile),
                         dfile)[0]
                     if FileUtility.exists(dfile):
                         valid_json_files.append(dfile)
                         Logger.getlogging().info('downloadedjsonfile\t' +
                                                  dfile)
                     # Only drop the upload entry once both values agree.
                     if value[0] == value[1]:
                         self.upload_file_list.pop(ufile)
         retransmissionfiles = impl.outtimefiles()
         for fl in retransmissionfiles.keys():
             # Download failure: start or update retransmission tracking.
             if fl not in self.all_retransmissionfiles:
                 self.all_retransmissionfiles[fl] = retransmissionfiles[fl]
             self.all_retransmissionfiles[fl].retrans_num += 1
             self.all_retransmissionfiles[fl].taskinfo = impl
             self.retransmissionfiles[fl] = self.all_retransmissionfiles[fl]
             if self.retransmissionfiles[
                     fl].retrans_num <= self.retransmissionlimitnum:
                 # The download failed, but treat it as downloaded and drop
                 # it from upload_file_list; it stays queued for retransmit.
                 self.upload_file_list.pop(fl)
                 Logger.getlogging().debug(
                     'download fail file {fl}:{num}th fail'.format(
                         fl=fl,
                         num=self.all_retransmissionfiles[fl].retrans_num))
             else:
                 # Retry limit exceeded: drop it from upload_file_list AND
                 # from the retransmission queue -- no more retries.
                 self.upload_file_list.pop(fl)
                 self.retransmissionfiles.pop(fl)
                 Logger.getlogging().debug(
                     'download fail file {fl}:more then {num}th fail'.
                     format(
                         fl=fl,
                         num=self.all_retransmissionfiles[fl].retrans_num -
                         1))
     return valid_json_files
コード例 #16
0
 def s2queryurl(query, website, url, onlywrite=False):
     """Record one S2 result URL for (query, site) and append it to the
     S2 URL file; report bookkeeping is skipped when onlywrite is True."""
     sitename = str(website)
     # Keep only the part after the last dot (e.g. module path -> name).
     dot = sitename.rfind('.')
     if dot >= 0:
         sitename = sitename[dot + 1:]
     if not onlywrite:
         SpiderReport.removequerysite(query, sitename)
         SpiderReport.getinstance().s2urlsitemap[Common.md5(url.strip())] = sitename
         SpiderReport.updates2site(query, sitename, SpiderReport.URL_UPLOAD, 1)
     line = SpiderReport.S2URL_FORMAT.format(query=query,
                                             website=sitename,
                                             url=url)
     FileUtility.writeline(SpiderReport.getinstance().s2urlfilepath, line)
コード例 #17
0
 def __init__(self):
     """Initialize report containers and reset today's S2 URL output file."""
     # Per-channel reports and S2 per-site reports.
     self.reportlist = {}
     self.s2sitereportlist = {}
     # S2 URL dump file for today's date; start from a clean slate.
     self.s2urlfilepath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_S2_QUERY_URLS_FILE).format(date=TimeUtility.getcurrentdate())
     FileUtility.remove(self.s2urlfilepath)
     # Aggregate report across all channels.
     self.totalreport = Report()
     self.totalreport.channel = 'SUM'
     self.s1urls = []
     self.querysitesmap = {}
     self.s2sitenum = 0
     self.s2urlsitemap = {}
コード例 #18
0
 def upload(self, path):
     """Upload via the base TencentDownloader, then copy the file into the
     local download path as a timestamped .done file for pickup."""
     tencentplatform.tencentdownloader.TencentDownloader.upload(self, path)
     filename = FileUtility.getfilename(path)
     ts = int(time.time())
     FileUtility.mkdirs(self.download_path)
     # Bug fix: both format strings ignored the 'filename' argument and
     # used a literal '(unknown)' name, so every upload mapped to the same
     # target; also reuse one timestamp so the log matches the copy.
     target = '{dir}/{filename}.txt.{ts}.done'.format(dir=self.download_path,
                                                     filename=filename,
                                                     ts=ts)
     Logger.getlogging().debug(path + '--->' + target)
     FileUtility.copy(path, target)
     return True
コード例 #19
0
 def s2upload(self, sfile):
     """Feed every query line of *sfile* to all sites as an S2 search;
     a failure on one query is logged and does not stop the rest."""
     if not FileUtility.exists(sfile):
         return
     for line in FileUtility.readlines(sfile):
         try:
             query = line.strip()
             self.conf.setchannel(SPIDER_CHANNEL_S2)
             self.conf.setquery(query)
             URLFileManager.getinstance().generateurlfilepath()
             for site in self.factory.getall():
                 site.s2query(query)
         except:
             Logger.printexception()
コード例 #20
0
def scanning():
    """Forever scan the local URL directory: back up and dispatch every URL
    file found, sleeping for the configured interval when the directory is
    empty. Never returns."""
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.removefiles(donepath)
    backupPath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_URL_BACKUP), TimeUtility.getcurrentdate())
    interval = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_INTERVAL)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath) 
    while True:
        Logger.getlogging().debug('scanning')
        flag = False
        for filename in os.listdir(scanningPath):
            try:
                urlfilepath = os.path.join(scanningPath, filename)
                backupfile  = os.path.join(backupPath, filename)
                # Skip directories and in-progress ('tmp') files.
                if os.path.isfile(urlfilepath) and 'tmp' not in filename:
                    Logger.getlogging().info('Get url file:{file}'.format(file=filename))
                    FileUtility.copy(urlfilepath, backupfile)
                    download(urlfilepath)
                # NOTE(review): flag is set for EVERY directory entry, not
                # only processed files, so the sleep below happens only when
                # the directory is completely empty -- confirm intended.
                if not flag:
                    flag = True
            except:
                Logger.printexception()
        if not flag:
            Logger.getlogging().debug('scanning interval sleeping {interval}s'.format(interval=interval))
            time.sleep(int(interval))    
コード例 #21
0
 def upload(self, path):
     """Track *path* for retransmission and push it over SSH; on failure
     emit an upload-failed notification and return False."""
     filename = FileUtility.getfilename(path)
     retans = RetransInfo()
     retans.filename = filename
     retans.start_time = int(time.time())
     self.uploadfile_retranslist[filename] = retans
     self.upload_file_list[filename] = []
     if self.sshupload(path):
         return True
     param = NotifyParam()
     param.code = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED
     param.message = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED_MESSAGE_FORMAT.format(
         file=filename, taskid=self.info.ip)
     SpiderNotify.notify(param)
     return False
コード例 #22
0
 def findmax(self):
     """Return the newest timestamp embedded in the backup file names under
     self.pucbackpath, deleting every file with an older timestamp.

     Returns 0 when the directory holds no files. Fix: the regex pattern
     is now a raw string, avoiding the invalid escape-sequence
     DeprecationWarning on modern Python.
     """
     filelist = FileUtility.getfilelist(self.pucbackpath, [])
     tf = {}
     for f in filelist:
         # The last run of digits in the name is taken as the timestamp.
         t = int(re.findall(r'(\d+)', f)[-1])
         tf[t] = f
     if not tf:
         return 0
     tm = max(tf.keys())
     for f in filelist:
         t = int(re.findall(r'(\d+)', f)[-1])
         if t < tm:
             Logger.getlogging().info('REMOVE {file}'.format(file=f))
             FileUtility.remove(f)
     return tm
コード例 #23
0
 def backupfile(self, jsonfile):
     """Count occurrences of each URL line in the backup file matching
     *jsonfile*'s base name; returns {url: count} (empty if no backup)."""
     urlmap = {}
     # '_split' files use '_split' as the base-name separator, else '.'.
     splitkey = '_split' if '_split' in jsonfile else '.'
     basename = FileUtility.getfilename(jsonfile).split(splitkey)[0]
     bkfile = self.urlbackuppath + '/' + basename
     if FileUtility.exists(bkfile):
         with open(bkfile, 'r') as bkfh:
             for line in bkfh.readlines():
                 line = line.strip()
                 urlmap[line] = urlmap.get(line, 0) + 1
     return urlmap
コード例 #24
0
 def upload(self, upfiles):
     """Upload each file in *upfiles*, dropping empty ones; abort and
     return False on the first upload failure, else True."""
     Logger.getlogging().debug('uploading ......')
     for file in upfiles:
         if self.emptyfile(file):
             Logger.getlogging().info('remove empty file: ' + file)
             FileUtility.remove(file)
         elif self.__upload__(file):
             Logger.getlogging().info('remove uploadedfile' + file)
             FileUtility.remove(file)
         else:
             Logger.log(FileUtility.getfilename(file),
                        constant.ERRORCODE_FAIL_LOAD_UP)
             return False
     time.sleep(1)
     return True
コード例 #25
0
 def retrydownload(self, jsonfile, urlset):
     """Re-queue URLs that failed to download for *jsonfile*; give up with
     error logs once the file has already been retried twice."""
     Logger.getlogging().warning(
         'upload failed urls {num}'.format(num=len(urlset)))
     context = URLFileManager.getinstance().geturlfilecontext(
         FileUtility.getfilename(jsonfile))
     if context.retry >= 2:
         Logger.getlogging().error('do not upload for failed again')
         for key in urlset.keys():
             Logger.getlogging().error(
                 'download {url} failed'.format(url=key))
         return
     # Repeat each URL as many times as it originally appeared.
     urls = []
     for key in urlset.keys():
         Logger.getlogging().warning(
             'retry download {url}'.format(url=key))
         urls.extend([key] * urlset[key])
     newurlfile = URLFileManager.getinstance().generateurlfilepath(
         context.retry + 1)
     Logger.getlogging().warning(
         'Retry download URL {file}'.format(file=newurlfile))
     # Re-store under the request type implied by the file suffix.
     if constant.POST_FILE_SUFFIX in jsonfile:
         request = constant.REQUEST_TYPE_POST
     elif constant.WEBKIT_FILE_SUFFIX in jsonfile:
         request = constant.REQUEST_TYPE_WEBKIT
     else:
         request = constant.REQUEST_TYPE_COMMON
     URLManager.getinstance().storeurls(urls, request)
コード例 #26
0
 def renewfilename(self, file):
     """Derive a new URL-file name with a fresh timestamp from *file*,
     update the file's context mapping, and return the new path.

     Returns False when no context is known for the file. Fix: the regex
     pattern is now a raw string, avoiding the invalid escape-sequence
     DeprecationWarning on modern Python.
     """
     filename = FileUtility.getfilename(file)
     context = URLFileManager.getinstance().geturlfilecontext(filename)
     if not context:
         return False
     # Never reuse the previous timestamp: wait out the current second.
     if self.filetime == int(time.time()):
         time.sleep(1)
     self.filetime = int(time.time())
     # Replace the last run of digits (the old timestamp) in the name.
     newfilename = filename.replace(
         re.findall(r'\d+', filename)[-1], str(self.filetime))
     urlsfile = self.tempurlpath + newfilename
     context.filename = urlsfile
     URLFileManager.getinstance().updateurlfilecontext(
         FileUtility.getfilename(urlsfile), context)
     return urlsfile
コード例 #27
0
 def download(self):
     """Fetch parsed json files from the local platform download directory.

     Platform downloads happen in two steps (windows debug requests only
     perform step 2):
     step 1: pull data from the platform into ./data/platform
     step 2: move data into ./data/temp/done, then store the parsed json
             under ./data/temp/json
     Returns the list of json file paths produced in this pass.
     """
     files = []
     Logger.getlogging().debug('Get Valid PUC File From ' +
                               self.download_path)
     #srclist = self.getvalidfiles(self.download_path)
     # Cap the number of files handled per pass at self.maxfilenum.
     srclist = FileUtility.getfilelist(self.download_path,
                                       [])[0:self.maxfilenum]
     for donefile in srclist:
         try:
             if donefile.endswith('done'):
                 Logger.getlogging().info('MOVE {file} TO {path}'.format(
                     file=donefile, path=self.done_file))
                 FileUtility.move(donefile, self.done_file)
                 binfile = os.path.join(self.done_file,
                                        FileUtility.getfilename(donefile))
                 #FileUtility.copy(donefile, self.cache_path)
                 #binfile = self.cache_path+ FileUtility.getfilename(donefile)
                 #if FileUtility.getfilesize(donefile) == FileUtility.getfilesize(binfile):
                 ## back up today's puc files
                 #Logger.getlogging().info('MOVE {file} TO {path}'.format(file=donefile,path=self.pucbacktoday))
                 #FileUtility.move(donefile, self.pucbacktoday)
                 #if FileUtility.exists(donefile):
                 #Logger.getlogging().error('MOVE {file} failed'.format(file=donefile))
                 #else:
                 #Logger.getlogging().error('File not equal {file}'.format(file=donefile))
                 jsonfile = self.bin2json(binfile)
                 files.append(jsonfile)
                 try:
                     self.s3puc_dumpurls(jsonfile)
                     time.sleep(0.5)
                     Logger.getlogging().debug(
                         'Remove {f}'.format(f=jsonfile))
                     FileUtility.remove(jsonfile)
                     donefile2 = os.path.join(
                         self.done_file, FileUtility.getfilename(donefile))
                     Logger.getlogging().debug(
                         'Remove {f}'.format(f=donefile2))
                     FileUtility.remove(donefile2)
                 except:
                     Logger.printexception()
                     Logger.getlogging().error(
                         'no json file generate from done file:{done}'.
                         format(done=binfile))
                     # Leave an empty placeholder json so callers still
                     # find a file at the returned path.
                     os.mknod(jsonfile)
         except:
             Logger.printexception()
     return files
コード例 #28
0
 def __init__(self):
     """Set up per-URL bookkeeping maps and resolve today's output paths."""
     # Previous-run metric snapshots keyed by URL, one dict per column.
     self.url_beforenewsinfo_map = {SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: {},
                                    SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: {},
                                    SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: {},
                                    SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: {}}
     self.url_beforenewsnum_map = {}
     self.url_curcmtcontent_map = {}
     self.url_curcmtnum_map = {}
     self.url_beforecmtnum_map = {}
     date = TimeUtility.getcurrentdate()
     path = os.path.join(
         SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH),
         date)
     suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                        const.SPIDER_OUTPUT_FILENAME_SUFFIX)
     datekey = date.replace('-', '_')
     self.outputpath = FileFormat.OUTPUTPATH.format(
         path=path, suffix=suffix, date=datekey, ts=int(time.time()))
     self.errorinfopath = FileFormat.ERRORINFOPATH.format(
         path=path, suffix=suffix, date=datekey, ts=int(time.time()))
     self.pushpath = os.path.join(
         SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_PUSH_PATH_MASTER),
         date)
     if not FileUtility.exists(path):
         FileUtility.mkdirs(path)
コード例 #29
0
 def bin2json(self, file):
     """Convert a downloaded .done binary into a .json file via the
     external parse tool and return the resulting json path."""
     jsonname = FileUtility.getfilename(file).replace('.done', '.json')
     self.execute(PUCDownloader.PARSE_COMMAND.format(command=self.parse_tool,
                                                     input=file,
                                                     output=self.json_path,
                                                     filename=jsonname))
     return self.json_path + jsonname
コード例 #30
0
 def getall(self):
     """Load the whole bsddb file into a plain dict.

     Returns None when the database file does not exist."""
     if not FileUtility.exists(self.dbfile):
         return None
     database = bsddb.btopen(self.dbfile, 'r')
     resdict = dict((key, database[key]) for key in database.keys())
     database.close()
     return resdict