def download(urlfilepath):
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)  
    filename = os.path.basename(urlfilepath)
    writeTmpfile = os.path.join(donepath, filename + '.temp')
    writefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile) 
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        readlines = FileUtility.readlines(urlfilepath)
        for line in readlines:
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # create an empty temp file
    with open(writeTmpfile, 'a+') as filetemp:
        filetemp.write('')
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, writeTmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, writeTmpfile)
    else:
        downGet(urlfilepath, writeTmpfile)
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=writefile))
    FileUtility.remove(urlfilepath)       
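# download() above relies on a temp-file handshake: results are written to
# '<name>.temp' and renamed to '<name>.txt.<timestamp>.done' only when the
# batch is complete, so consumers never pick up half-written files. A minimal
# sketch of that pattern (hypothetical helper; assumes the module's existing
# os and time imports):
def write_done_file(donepath, filename, lines):
    tmpfile = os.path.join(donepath, filename + '.temp')
    donefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    with open(tmpfile, 'w') as fp:
        for line in lines:
            fp.write(line + '\n')
    # os.rename is atomic on POSIX when both paths are on the same filesystem
    os.rename(tmpfile, donefile)
    return donefile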
def readFile(urlpath, filename):
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                       const.SPIDER_POST_WHOAMI)
    donepath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    writeTmpfile = os.path.join(donepath, filename + '.tmp')
    now = str(int(time.time()))
    writefile = os.path.join(donepath, filename + '.txt.' + now + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    Logger.getlogging().debug('post_done start:{f}'.format(f=writefile))
    with open(urlpath, 'r') as fp:
        lines = fp.readlines()
        os.mknod(writeTmpfile)
        for line in lines:
            jsonLine = json.loads(line)
            try:
                jsonStr = downPost(jsonLine)
                with open(writeTmpfile, 'a+') as filetemp:
                    filetemp.write(jsonStr + '\n')
                Logger.getlogging().debug(
                    '{url}:Post request succeeded'.format(url=jsonLine['url']))
            except Exception:
                Logger.getlogging().warning(
                    '{url}:Post request failed'.format(url=jsonLine['url']))
                Logger.printexception()
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('post_done end:{f}'.format(f=writefile))
    FileUtility.remove(urlpath)
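# readFile() above treats the url file as JSON Lines: one JSON object per line,
# each carrying at least a 'url' field, and a failure on one line must not
# abort the batch. A hedged sketch of that per-line pattern (process() is a
# hypothetical stand-in for downPost; assumes the module's json import):
def process_json_lines(path, process):
    results = []
    with open(path, 'r') as fp:
        for line in fp:
            record = json.loads(line)
            try:
                results.append(process(record))
            except Exception:
                # skip records that fail; the caller keeps going
                pass
    return results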
def flush():
    # dump s1 urls that failed to download
    SpiderConfigure.getinstance().setchannel(constant.SPIDER_CHANNEL_S1)
    SpiderConfigure.getinstance().setquery('')
    for url in SpiderReport.getinstance().s1urls:
        Logger.log(url, constant.ERRORCODE_FAIL_LOAD_DOWN)
    # dump queries for which the website returned no urls
    querynositemap = {}
    for query in SpiderReport.getinstance().querysitesmap.keys():
        querynositemap[query] = 0
        for site in SpiderReport.getinstance().querysitesmap[query]:
            SpiderReport.s2queryurl(query, site, None, True)
            querynositemap[query] += 1

    for query in SpiderReport.getinstance().querysitesmap.keys():
        if query in querynositemap:
            SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                    SpiderReport.getinstance().s2sitenum - querynositemap[query], True)
        else:
            SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                    SpiderReport.getinstance().s2sitenum, True)

    # report
    filename = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                         const.SPIDER_INFO_REPORT_FILE).format(
        date=TimeUtility.getcurrentdate())
    FileUtility.remove(filename)
    FileUtility.writeline(filename, SpiderReport.REPORT_FORMAT.format(
        ch='CHANNEL',
        query='QUERY',
        type='TYPE',
        v1='UPLOAD',
        v2='DOWNLOAD',
        v3='NO_TEMPLATE',
        v4='NO_SITE',
        v5='WITH_CMT',
        v6='FAILED'
    ))
    for key in SpiderReport.getinstance().reportlist.keys():
        for type in SpiderReport.getinstance().reportlist[key].keys():
            r = SpiderReport.getinstance().reportlist[key][type]
            FileUtility.writeline(filename, r.tostring())
    for key in SpiderReport.getinstance().s2sitereportlist.keys():
        for type in SpiderReport.getinstance().s2sitereportlist[key].keys():
            r = SpiderReport.getinstance().s2sitereportlist[key][type]
            FileUtility.writeline(filename, r.tostring())
    FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring())
    FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring2())
    FileUtility.flush()
    threshold = float(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                const.SPIDER_FAILED_THRESHOLD))
    rate = SpiderReport.getinstance().totalreport.getsuccess()
    if rate < threshold:
        Logger.getlogging().warning('success rate is lower than threshold')
        param = NotifyParam()
        param.code = NotifyParam.SPIDER_NOTIFY_OVER_FAILED
        param.message = 'success rate {rate} is lower than threshold {th}'.format(rate=Common.float2percent(rate),
                                                                                  th=Common.float2percent(threshold))
        SpiderNotify.notify(param)
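# flush() above writes the report as fixed-width rows: one header line, then
# one line per (channel, type) entry. A minimal sketch of that header/row
# discipline with a hypothetical format string (the real layout lives in
# SpiderReport.REPORT_FORMAT):
REPORT_FORMAT_SKETCH = '{ch:>8}|{query:>16}|{v1:>8}|{v2:>8}'

def write_report(path, rows):
    with open(path, 'w') as fp:
        fp.write(REPORT_FORMAT_SKETCH.format(ch='CHANNEL', query='QUERY',
                                             v1='UPLOAD', v2='DOWNLOAD') + '\n')
        for row in rows:
            # each row is a dict with the same keys as the header
            fp.write(REPORT_FORMAT_SKETCH.format(**row) + '\n')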
 def download(self):
     """
     平台上的下载分为两个步骤,而windows直接请求数据则只有step2:download()
     step1:从平台上下载数据到本地./data/platform
     step2:从./data/platform拷贝数据到./data/temp/done下,再存储解析后的json数据至./data/temp/json
     """
     files = []
     Logger.getlogging().debug('Get Valid PUC File From ' +
                               self.download_path)
     #srclist = self.getvalidfiles(self.download_path)
     srclist = FileUtility.getfilelist(self.download_path,
                                       [])[0:self.maxfilenum]
     for donefile in srclist:
         try:
             if donefile.endswith('done'):
                 Logger.getlogging().info('MOVE {file} TO {path}'.format(
                     file=donefile, path=self.done_file))
                 FileUtility.move(donefile, self.done_file)
                 binfile = os.path.join(self.done_file,
                                        FileUtility.getfilename(donefile))
                 #FileUtility.copy(donefile, self.cache_path)
                 #binfile = self.cache_path+ FileUtility.getfilename(donefile)
                 #if FileUtility.getfilesize(donefile) == FileUtility.getfilesize(binfile):
                 ## back up today's puc files
                 #Logger.getlogging().info('MOVE {file} TO {path}'.format(file=donefile,path=self.pucbacktoday))
                 #FileUtility.move(donefile, self.pucbacktoday)
                 #if FileUtility.exists(donefile):
                 #Logger.getlogging().error('MOVE {file} failed'.format(file=donefile))
                 #else:
                 #Logger.getlogging().error('File not equal {file}'.format(file=donefile))
                 jsonfile = self.bin2json(binfile)
                 files.append(jsonfile)
                 try:
                     self.s3puc_dumpurls(jsonfile)
                     time.sleep(0.5)
                     Logger.getlogging().debug(
                         'Remove {f}'.format(f=jsonfile))
                     FileUtility.remove(jsonfile)
                     donefile2 = os.path.join(
                         self.done_file, FileUtility.getfilename(donefile))
                     Logger.getlogging().debug(
                         'Remove {f}'.format(f=donefile2))
                     FileUtility.remove(donefile2)
                 except Exception:
                     Logger.printexception()
                     Logger.getlogging().error(
                         'no json file generated from done file: {done}'.
                         format(done=binfile))
                     os.mknod(jsonfile)
         except Exception:
             Logger.printexception()
     return files
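# A hedged sketch of the polling step above: list the platform directory, keep
# only '*.done' files up to maxfilenum, and convert each one
# (convert() is a hypothetical stand-in for bin2json; assumes the module's os import):
def collect_done_files(download_path, maxfilenum, convert):
    names = [n for n in sorted(os.listdir(download_path)) if n.endswith('done')]
    return [convert(os.path.join(download_path, n)) for n in names[:maxfilenum]]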
 def __init__(self):
     self.reportlist = {}
     self.s2sitereportlist = {}
     self.s2urlfilepath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                    const.SPIDER_S2_QUERY_URLS_FILE).format(
         date=TimeUtility.getcurrentdate())
     FileUtility.remove(self.s2urlfilepath)
     self.totalreport = Report()
     self.totalreport.channel = 'SUM'
     self.s1urls = []
     self.querysitesmap = {}
     self.s2sitenum = 0
     self.s2urlsitemap = {}
 def upload(self, upfiles):
     Logger.getlogging().debug('uploading ......')
     for file in upfiles:
         if self.emptyfile(file):
             Logger.getlogging().info('remove empty file: ' + file)
             FileUtility.remove(file)
             continue
         if not self.__upload__(file):
             Logger.log(FileUtility.getfilename(file),
                        constant.ERRORCODE_FAIL_LOAD_UP)
             return False
         Logger.getlogging().info('remove uploaded file: ' + file)
         FileUtility.remove(file)
     time.sleep(1)
     return True
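# upload() above encodes a simple batch contract: empty files are deleted and
# skipped, every successfully uploaded file is removed locally, and the first
# failed transfer aborts the whole batch. A minimal sketch (send() is a
# hypothetical stand-in for self.__upload__; assumes the module's os import):
def upload_batch(paths, send):
    for path in paths:
        if os.path.getsize(path) == 0:
            os.remove(path)      # drop empties instead of uploading them
            continue
        if not send(path):
            return False         # abort on the first failure
        os.remove(path)          # uploaded files are not kept locally
    return True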
 def findmax(self):
     filelist = FileUtility.getfilelist(self.pucbackpath, [])
     tf = {}
     for f in filelist:
         t = int(re.findall(r'(\d+)', f)[-1])
         tf[t] = f
     if not tf:
         return 0
     tm = max(tf.keys())
     for f in filelist:
         t = int(re.findall(r'(\d+)', f)[-1])
         if t < tm:
             Logger.getlogging().info('REMOVE {file}'.format(file=f))
             FileUtility.remove(f)
     return tm
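# findmax() above keys each backup file by the last run of digits in its name,
# keeps the newest, and removes the rest. A self-contained sketch of that
# pruning rule over plain filenames (hypothetical helper; assumes the module's
# re and os imports):
def prune_keep_newest(filenames):
    stamps = {f: int(re.findall(r'(\d+)', f)[-1]) for f in filenames}
    if not stamps:
        return 0
    newest = max(stamps.values())
    for f, t in stamps.items():
        if t < newest:
            os.remove(f)         # everything older than the newest stamp goes
    return newest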
 def download(self):
     """
     平台上的下载分为两个步骤,而windows直接请求数据则只有step2:download()
     step1:从平台上下载数据到本地./data/platform
     step2:从./data/platform拷贝数据到./data/temp/done下,再存储解析后的json数据至./data/temp/json
     """
     files = []
     if self.completed():
         return files
     Logger.getlogging().debug(self.download_path)
     srclist = FileUtility.getfilelist(self.download_path, [])
     for donefile in srclist:
         filename = FileUtility.getfilename(donefile)
         if donefile.endswith(
                 'done') and filename not in self.download_file_list:
             self.download_file_list.append(filename)
             self.download_time = time.time()
             for upfile in self.upload_file_list.keys():
                 if filename.startswith(upfile):
                     FileUtility.copy(donefile, self.cache_path)
                     binfile = self.cache_path + FileUtility.getfilename(
                         donefile)
                     if FileUtility.getfilesize(
                             donefile) == FileUtility.getfilesize(binfile):
                         Logger.getlogging().info(
                             'Remove {file}'.format(file=donefile))
                         FileUtility.remove(donefile)
                         if FileUtility.exists(donefile):
                             Logger.getlogging().error(
                                 'Remove {file} failed'.format(
                                     file=donefile))
                     else:
                         Logger.getlogging().error(
                             'File not equal {file}'.format(file=donefile))
                     jsonfile = self.bin2json(binfile)
                     files.append(jsonfile)
                     uploadtime = self.uploadfile_retranslist[
                         upfile].start_time
                     if RegexUtility.match(
                             TencentDownloader.DOWNLOAD_FORMAT1.format(
                                 file=upfile), filename):
                         self.upload_file_list.pop(upfile)
                         self.uploadfile_retranslist.pop(upfile)
                     elif RegexUtility.match(
                             TencentDownloader.DOWNLOAD_FORMAT2.format(
                                 file=upfile), filename):
                         value = RegexUtility.parse(
                             TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile), filename)[0]
                         if value[0] == value[1]:
                             self.upload_file_list.pop(upfile)
                             self.uploadfile_retranslist.pop(upfile)
                     if not FileUtility.exists(jsonfile):
                         Logger.getlogging().error(
                             'no json file generated from done file: {done}'.
                             format(done=binfile))
                         os.mknod(jsonfile)
                     # update upload time
                     keys = self.sortkeys()
                     for fl in keys:
                         if self.uploadfile_retranslist[
                                 fl].start_time >= uploadtime:
                             self.uploadfile_retranslist[
                                 fl].start_time = time.time()
                             time.sleep(0.1)
                     break
     return files
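# The copy step above is verified by comparing file sizes before the source is
# deleted. A minimal sketch of that verify-then-delete idiom (assumes the
# module's os import; shutil is imported here for the copy):
import shutil

def copy_verified(src, dst_dir):
    dst = os.path.join(dst_dir, os.path.basename(src))
    shutil.copy(src, dst)
    if os.path.getsize(src) == os.path.getsize(dst):
        os.remove(src)           # remove the source only after sizes match
        return dst
    return None                  # leave the source in place on mismatch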
 def show(self):
     diffinfolist = {}
     predict = self.database.getall()
     instances = URLStorage.getinstances()
     Logger.getlogging().info(
         '##############################################################################################'
     )
     Logger.getlogging().info(
         '%8s|%8s|%8s|%8s|%8s|%8s|%8s|%20s|%16s' %
         ('key', 'flag', 'cmtnum', 'clicknum', 'votenum', 'fansnum',
          'realnum', 'pubtime', 'timestamp'))
     for ins in instances.keys():
         diffinfolist[ins] = DiffInfomation()
         if ins != constant.SPIDER_CHANNEL_S1:
             diffinfolist[ins].channel = constant.SPIDER_CHANNEL_S2
             diffinfolist[ins].query = ins
         for key in instances[ins].urlinfodict:
             if instances[ins].urlinfodict[key].realnum > 0:
                 StatisticsManager.updategotcomments(1)
             elif instances[ins].urlinfodict[key].cmtnum > 0:
                 StatisticsManager.updatefailgotcomment(1)
             if predict and key in predict:
                 info = URLCommentInfo.fromstring(predict[key])
                 if not instances[ins].urlinfodict[key].isequal(info):
                     self.printinfo(ins, info, '-')
                     self.printinfo(ins, instances[ins].urlinfodict[key],
                                    '+')
                     if instances[ins].urlinfodict[key].cmtnum > 0:
                         diffinfolist[ins].deltacmt += self.diff(
                             instances[ins].urlinfodict[key].cmtnum,
                             info.cmtnum)
                     else:
                         diffinfolist[ins].deltacmt += self.diff(
                             instances[ins].urlinfodict[key].realnum,
                             info.realnum)
                     diffinfolist[ins].deltaclick += self.diff(
                         instances[ins].urlinfodict[key].clicknum,
                         info.clicknum)
                     diffinfolist[ins].deltavote += self.diff(
                         instances[ins].urlinfodict[key].votenum,
                         info.votenum)
                     diffinfolist[ins].deltafans += self.diff(
                         instances[ins].urlinfodict[key].fansnum,
                         info.fansnum)
             else:
                 self.printinfo(ins, instances[ins].urlinfodict[key], '+')
                 if instances[ins].urlinfodict[key].cmtnum > 0:
                     diffinfolist[ins].deltacmt += instances[
                         ins].urlinfodict[key].cmtnum
                 else:
                     diffinfolist[ins].deltacmt += max(
                         0, instances[ins].urlinfodict[key].realnum)
                 diffinfolist[ins].deltaclick += max(
                     0, instances[ins].urlinfodict[key].clicknum)
                 diffinfolist[ins].deltavote += max(
                     0, instances[ins].urlinfodict[key].votenum)
                 diffinfolist[ins].deltafans += max(
                     0, instances[ins].urlinfodict[key].fansnum)
     Logger.getlogging().info(
         '##############################################################################################'
     )
     if FileUtility.exists(self.difffile):
         FileUtility.remove(self.difffile)
     for key in diffinfolist.keys():
         Logger.getlogging().info(diffinfolist[key].tostring())
         FileUtility.writeline(self.difffile, diffinfolist[key].tostring())
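# show() above compares the previous snapshot from the database against the
# live URLStorage instances key by key, printing changed entries as a '-'/'+'
# pair and accumulating per-channel deltas. A hedged sketch of that compare
# loop over two plain dicts (hypothetical helper):
def diff_snapshots(previous, current):
    changed = {}
    for key, value in current.items():
        if previous.get(key) != value:
            # (old, new); old is None when the key is new
            changed[key] = (previous.get(key), value)
    return changed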