def readFile(urlpath, filename):
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                       const.SPIDER_POST_WHOAMI)
    donepath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    writeTmpfile = os.path.join(donepath, filename + '.tmp')
    now = str(int(time.time()))
    writefile = os.path.join(donepath, filename + '.txt.' + now + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    Logger.getlogging().debug('post_done start:{f}'.format(f=writefile))
    with open(urlpath, 'r') as fp:
        lines = fp.readlines()
        # create the empty tmp file; open()/close() is portable where os.mknod is not
        open(writeTmpfile, 'a').close()
        for line in lines:
            jsonLine = json.loads(line)
            try:
                jsonStr = downPost(jsonLine)
                with open(writeTmpfile, 'a+') as filetemp:
                    filetemp.write(jsonStr + '\n')
                Logger.getlogging().debug(
                    '{url}:Post request succeeded'.format(url=jsonLine['url']))
            except Exception:
                Logger.getlogging().warning(
                    '{url}:Post request failed'.format(url=jsonLine['url']))
                Logger.printexception()
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('post_done end:{f}'.format(f=writefile))
    FileUtility.remove(urlpath)
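
readFile expects urlpath to be a JSON-lines file in which every record carries at least a 'url' key, and it delegates the actual request to a downPost helper that is not shown in this example. A minimal sketch of what such a helper might look like, assuming the standard requests library; the 'data' and 'html' field names are hypothetical:

import json
import requests

def downPost(jsonLine):
    # Hypothetical sketch: issue the POST described by one JSON-lines record
    # and return the serialized result that readFile appends to the tmp file.
    # Only the 'url' key is confirmed by the caller above; 'data' and 'html'
    # are assumptions for illustration.
    response = requests.post(jsonLine['url'], data=jsonLine.get('data'), timeout=30)
    jsonLine['html'] = response.text
    return json.dumps(jsonLine)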
Example #2
def download(self):
    files = []
    if self.completed():
        return files
    Logger.getlogging().debug(self.info.donepath)
    srclist = self.sshls(self.info.donepath)
    for donefile in srclist:
        donefile = donefile.strip()
        filename = FileUtility.getfilename(donefile)
        if donefile.endswith('done') and filename not in self.download_file_list:
            self.download_file_list.append(filename)
            # snapshot the keys: we pop from the dict inside the loop
            for upfile in list(self.upload_file_list.keys()):
                if filename.startswith(upfile):
                    FileUtility.mkdirs(self.info.localdonepath)
                    self.sshdownload(donefile)
                    dfile = self.info.localdonepath + FileUtility.getfilename(donefile)
                    if self.info.jsonpath:
                        dfile = self.bin2json(dfile)
                    files.append(dfile)
                    self.download_time = int(time.time())
                    self.upload_file_list.pop(upfile)
                    self.uploadfile_retranslist.pop(upfile)
                    if not FileUtility.exists(dfile):
                        Logger.getlogging().error(
                            'no json file generated from done file:{done}'.format(done=dfile))
                    break
    return files
def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.removefiles(donepath)
    backupPath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_URL_BACKUP), TimeUtility.getcurrentdate())
    interval = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_INTERVAL)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath) 
    while True:
        Logger.getlogging().debug('scanning')
        flag = False
        for filename in os.listdir(scanningPath):
            try:
                urlfilepath = os.path.join(scanningPath, filename)
                backupfile  = os.path.join(backupPath, filename)
                if os.path.isfile(urlfilepath) and 'tmp' not in filename:
                    Logger.getlogging().info('Get url file:{file}'.format(file=filename))
                    FileUtility.copy(urlfilepath, backupfile)
                    download(urlfilepath)
                flag = True
            except Exception:
                Logger.printexception()
        if not flag:
            Logger.getlogging().debug('scanning interval sleeping {interval}s'.format(interval=interval))
            time.sleep(int(interval))    
Example #4
def __upload__(self, filepath):
    flag = True
    FileUtility.mkdirs(self.urlbackuppath)
    FileUtility.copy(filepath, self.urlbackuppath)
    self.upload_file_list[FileUtility.getfilename(filepath)] = []
    # if filepath.endswith(constant.POST_FILE_SUFFIX) or FileUtility.getfilelines(filepath) <= constant.REMOTE_DOWNLOADER_MIN_LINES:
    #     if self.limpls:
    #         if self.limplsindex >= len(self.limpls):
    #             self.limplsindex = 0
    #         flag = self.limpls[self.limplsindex].upload(filepath)
    #         self.limplsindex += 1
    if filepath.endswith(constant.WEBKIT_FILE_SUFFIX):
        if self.wimpls:
            if self.wimplsindoex >= len(self.wimpls):
                self.wimplsindoex = 0
            self.wimpls[self.wimplsindoex].upload(filepath)
            self.wimplsindoex += 1
    elif self.impls:
        if self.implsindex >= len(self.impls):
            self.implsindex = 0
        flag = self.impls[self.implsindex].upload(filepath)
        self.implsindex += 1
    else:
        flag = False
        Logger.getlogging().warning('No taskid or download platform!')
    return flag
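
__upload__ above rotates files across a pool of downloader implementations (wimpls for WebKit-suffixed files, impls for everything else), advancing an index by hand and wrapping it when it runs past the end of the list. The same round-robin can be written without the index bookkeeping; a sketch assuming each implementation exposes an upload(filepath) method:

import itertools

class UploadDispatcher(object):
    def __init__(self, impls):
        # itertools.cycle yields impls[0], impls[1], ... and wraps forever,
        # replacing the manual index-and-reset logic in __upload__.
        self._rotation = itertools.cycle(impls) if impls else None

    def upload(self, filepath):
        if self._rotation is None:
            return False
        return next(self._rotation).upload(filepath)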
def download(urlfilepath):
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    filename = os.path.basename(urlfilepath)
    writeTmpfile = os.path.join(donepath, filename + '.temp')
    writefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        readlines = FileUtility.readlines(urlfilepath)
        for line in readlines:
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # create the empty tmp file up front so the rename below always has a source
    with open(writeTmpfile, 'a+') as filetemp:
        filetemp.write('')
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, writeTmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, writeTmpfile)
    else:
        downGet(urlfilepath, writeTmpfile)
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=writefile))
    FileUtility.remove(urlfilepath)       
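
Both readFile and download stage their output under a tmp name and only rename it to the final *.done name once everything has been written. Since os.rename is atomic within a single POSIX filesystem, a consumer polling for *.done files can never pick up a half-written one. The pattern in isolation, with hypothetical names:

import os
import time

def write_done_file(donepath, filename, lines):
    # Stage the output under a .tmp name, then atomically rename it to .done
    # so that directory pollers never observe a partially written file.
    tmpfile = os.path.join(donepath, filename + '.tmp')
    donefile = os.path.join(donepath, '{0}.txt.{1}.done'.format(filename, int(time.time())))
    with open(tmpfile, 'w') as fp:
        for line in lines:
            fp.write(line + '\n')
    os.rename(tmpfile, donefile)
    return donefile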
def upload(self, path):
    tencentplatform.postdownloader.PostDownloader.upload(self, path)
    filename = FileUtility.getfilename(path)
    FileUtility.mkdirs(self.download_path)
    FileUtility.copy(
        path,
        '{dir}/{filename}.txt.{ts}.done'.format(dir=self.download_path,
                                                filename=filename,
                                                ts=int(time.time())))
    return True
Example #7
def waibuetl(self):
    waibubackup = SpiderConfigure.getwaibubaup()
    if not FileUtility.exists(waibubackup):
        FileUtility.mkdirs(waibubackup)

    waibufile = self.etl.getqueryfromdb()
    if not FileUtility.exists(waibufile):
        Logger.getlogging().warning(
            '{waibufile} not generated!'.format(waibufile=waibufile))
        return

    outtime = 0
    self.wdownloader.upload(waibufile)
    continueflag = True
    while continueflag:
        downloadfiles = []
        while True:
            Logger.getlogging().info(
                'sleeping {sec}s......'.format(sec=self.waitingperiod))
            time.sleep(self.waitingperiod)
            outtime += self.waitingperiod
            if self.wdownloader.iscompleted():
                continueflag = False
                break
            try:
                downloadfiles = self.wdownloader.download()
                if downloadfiles:
                    break
            except Exception:
                Logger.printexception()
            if outtime >= self.waibutimeout:
                Logger.getlogging().warning(
                    'Waibu Data Download Timeout! Spending {sec}s'.format(
                        sec=outtime))
                continueflag = False
                break
        for dfile in downloadfiles:
            starttime = TimeUtility.getcurrentdate(
                TimeUtility.TIME_FORMAT_DEFAULT)
            self.etl.wb_analysis(dfile)
            #if FileUtility.exists(waibubackup+FileUtility.getfilename(dfile)):
            #FileUtility.remove(waibubackup+FileUtility.getfilename(dfile))
            FileUtility.move(dfile, waibubackup)
            logstring = 'PROCESSWAIBUFILE:\t{file}\t{start}\t{end}'.format(
                file=FileUtility.getfilename(dfile),
                start=starttime,
                end=TimeUtility.getcurrentdate())
            Logger.getlogging().info(logstring)
            if outtime >= self.waibutimeout:
                Logger.getlogging().warning(
                    'Waibu Data Download Timeout! Spending {sec}s'.format(
                        sec=outtime))
                continueflag = False
                break
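
The nested loops in waibuetl implement a generic poll-with-timeout: upload a query file, then repeatedly ask the downloader for results until files arrive, the task reports completion, or waibutimeout seconds have been spent. The control flow reduces to something like the following sketch, where fetch, is_done, period and timeout are hypothetical stand-ins for the downloader calls and configuration used above:

import time

def poll_until(fetch, is_done, period, timeout):
    # Poll fetch() every period seconds until it returns results,
    # is_done() reports completion, or timeout seconds have elapsed.
    waited = 0
    while waited < timeout:
        if is_done():
            return []
        try:
            results = fetch()
            if results:
                return results
        except Exception:
            pass  # waibuetl logs the exception and keeps polling
        time.sleep(period)
        waited += period
    return []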
def upload(self, path):
    tencentplatform.tencentdownloader.TencentDownloader.upload(self, path)
    filename = FileUtility.getfilename(path)
    ts = int(time.time())
    FileUtility.mkdirs(self.download_path)
    Logger.getlogging().debug(
        path + '--->' + '{dir}/{filename}.txt.{ts}.done'.format(
            dir=self.download_path, filename=filename, ts=ts))
    FileUtility.copy(
        path,
        '{dir}/{filename}.txt.{ts}.done'.format(dir=self.download_path,
                                                filename=filename,
                                                ts=ts))
    return True
Example #9
def __init__(self, taskinfo=None, download_path=None):
    self.taskinfo = taskinfo
    self.maxfilenum = 100
    self.cache_path = Storage.getstoragelocation(
        const.SPIDER_DONE_TEMP_PATH)
    path = SpiderConfigure.getconfig(
        const.SPIDER_TENCENT_PLATFORM_DOMAIN,
        const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
    if download_path:
        self.download_path = download_path
    else:
        self.download_path = PUCDownloader.DOWNLOAD_PATH.format(
            path=path, taskid=self.taskinfo.taskid)
    self.parse_tool = SpiderConfigure.getconfig(
        const.SPIDER_TENCENT_PLATFORM_DOMAIN,
        const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
    #self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
    self.pucbackpath = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN,
        const.SPIDER_PUC_BACKUP_PATH) + self.taskinfo.taskid
    self.pucbacktoday = os.path.join(self.pucbackpath,
                                     TimeUtility.getcurrentdate())
    if not FileUtility.exists(self.pucbackpath):
        FileUtility.mkdirs(self.pucbackpath)
    if not FileUtility.exists(self.pucbacktoday):
        FileUtility.mkdirs(self.pucbacktoday)
    self.done_file = self.pucbacktoday + '/done/'
    self.json_path = self.pucbacktoday + '/json/'
    if not FileUtility.exists(self.done_file):
        FileUtility.mkdirs(self.done_file)
    if not FileUtility.exists(self.json_path):
        FileUtility.mkdirs(self.json_path)
    self.pucsavedays = 0
    self.clear()
Example #10
def __init__(self):
    self.url_beforenewsinfo_map = {SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: {}}
    self.url_beforenewsnum_map = {}
    self.url_curcmtcontent_map = {}
    self.url_curcmtnum_map = {}
    self.url_beforecmtnum_map = {}
    date = TimeUtility.getcurrentdate()
    path = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH), date)
    suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_FILENAME_SUFFIX)
    self.outputpath = FileFormat.OUTPUTPATH.format(path=path, suffix=suffix, date=date.replace('-', '_'), ts=int(time.time()))
    self.errorinfopath = FileFormat.ERRORINFOPATH.format(path=path, suffix=suffix, date=date.replace('-', '_'), ts=int(time.time()))
    self.pushpath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_PUSH_PATH_MASTER), date)
    if not FileUtility.exists(path):
        FileUtility.mkdirs(path)
Example #11
def download(self):
    doneurl = TencentDownloader.DONE_FILE_URL.format(
        taskid=self.taskinfo.taskid)
    html = TencentDownloader.httpget(doneurl)
    if html:
        xparse = XPathUtility(html)
        for donefile in xparse.getlist(r'//tr/td[2]/a'):
            if donefile.endswith('done') and donefile not in self.downloadedfiles:
                for upfile in self.upload_file_list:
                    if donefile.startswith(upfile):
                        FileUtility.mkdirs(self.download_path)
                        self.execute(
                            TencentDownloader.DOWNLOAD_COMMAND.format(
                                taskid=self.taskinfo.taskid,
                                filename=donefile))
                        FileUtility.move('./' + donefile,
                                         self.download_path)
                        break
                self.downloadedfiles.append(donefile)
    return tencentdownloader.TencentDownloader.download(self)
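
This download scrapes the platform's file-listing page and takes the anchor in the second table cell of each row (//tr/td[2]/a) to enumerate finished files. XPathUtility is project-specific, but the same extraction could be reproduced with lxml; a sketch, assuming the listing is plain HTML:

from lxml import html as lxml_html

def list_done_files(page_html):
    # Mirror the //tr/td[2]/a query used by XPathUtility above and keep
    # only entries whose names end with 'done'.
    tree = lxml_html.fromstring(page_html)
    names = [a.text_content().strip() for a in tree.xpath('//tr/td[2]/a')]
    return [name for name in names if name.endswith('done')]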
def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                       const.SPIDER_POST_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    backupPath = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                           const.DOWNLOADER_URL_BACKUP)

    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath)
    flag = False
    for filename in os.listdir(scanningPath):
        fp = os.path.join(scanningPath, filename)
        backupfile = os.path.join(backupPath, filename)
        if os.path.isfile(fp) and 'tmp' not in filename:
            Logger.getlogging().info(
                'Get url file:{file}'.format(file=filename))
            FileUtility.move(fp, backupfile)
            readFile(backupfile, filename)
        flag = True
    if not flag:
        time.sleep(10)
Example #13
def mkcachedir():
    cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_TEMPLATE_WORK_DIRECTORY)
    FileUtility.rmdir(cache)
    FileUtility.mkdirs(cache)
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_WAIBU_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_TIEBA_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_URLS_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_OUTPUT_TEMP_PATH))

    limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH_LIMIT))
    outputpath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH)
    if FileUtility.exists(outputpath):
        validdate = TimeUtility.getuniformdatebefore(limit)
        for s in os.listdir(outputpath):
            if s < validdate:
                fullpath = os.path.join(outputpath, s)
                FileUtility.rmdir(fullpath)
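
The cleanup at the end of mkcachedir compares directory names to validdate as plain strings. That works because the per-day output directories elsewhere in these examples are named with zero-padded dates such as 2016-07-10 (see the date.replace('-', '_') usage above), and zero-padded ISO-style dates sort the same way lexically and chronologically. A self-contained illustration of that assumption, with a hypothetical 30-day limit:

import datetime

# Zero-padded ISO dates order identically under string and date comparison,
# so a check like s < validdate is a correct age test for such names.
validdate = (datetime.date.today() - datetime.timedelta(days=30)).isoformat()
dirs = ['2016-06-01', '2016-07-09', '2016-07-10']
expired = [d for d in dirs if d < validdate]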
def __init__(self):
    self.downloader = SchedulDownloader()
    self.urlpath = SpiderConfigure.getconfig(const.SPIDER_SCHEDULER_DOMAIN,
                                             const.SCHEDULER_URL_PATH)
    FileUtility.mkdirs(self.urlpath)