def flush():
        # dump the S1 URLs whose download failed
        SpiderConfigure.getinstance().setchannel(constant.SPIDER_CHANNEL_S1)
        SpiderConfigure.getinstance().setquery('')
        for url in SpiderReport.getinstance().s1urls:
            Logger.log(url, constant.ERRORCODE_FAIL_LOAD_DOWN)
        # dump queries for which no URLs were retrieved from the websites
        querynositemap = {}
        for query in SpiderReport.getinstance().querysitesmap.keys():
            querynositemap[query] = 0
            for site in SpiderReport.getinstance().querysitesmap[query]:
                SpiderReport.s2queryurl(query, site, None, True)
                querynositemap[query] += 1
#
        for query in SpiderReport.getinstance().querysitesmap.keys():
            if query in querynositemap:
                SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                        SpiderReport.getinstance().s2sitenum - querynositemap[query], True)
            else:
                SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                        SpiderReport.getinstance().s2sitenum, True)
#
        # report
        filename = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                             const.SPIDER_INFO_REPORT_FILE).format(
            date=TimeUtility.getcurrentdate())
        FileUtility.remove(filename)
        FileUtility.writeline(filename, SpiderReport.REPORT_FORMAT.format(
            ch='CHANNEL',
            query='QUERY',
            type='TYPE',
            v1='UPLOAD',
            v2='DOWNLOAD',
            v3='NO_TEMPLATE',
            v4='NO_SITE',
            v5='WITH_CMT',
            v6='FAILED'
        ))
        for key in SpiderReport.getinstance().reportlist.keys():
            for type in SpiderReport.getinstance().reportlist[key].keys():
                r = SpiderReport.getinstance().reportlist[key][type]
                FileUtility.writeline(filename, r.tostring())
        for key in SpiderReport.getinstance().s2sitereportlist.keys():
            for type in SpiderReport.getinstance().s2sitereportlist[key].keys():
                r = SpiderReport.getinstance().s2sitereportlist[key][type]
                FileUtility.writeline(filename, r.tostring())
        FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring())
        FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring2())
        FileUtility.flush()
        threshold = float(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                    const.SPIDER_FAILED_THRESHOLD))
        rate = SpiderReport.getinstance().totalreport.getsuccess()
        if rate < threshold:
            Logger.getlogging().warning('success rate is lower than threshold')
            param = NotifyParam()
            param.code = NotifyParam.SPIDER_NOTIFY_OVER_FAILED
            param.message = 'success rate {rate} is lower than threshold {th}'.format(rate=Common.float2percent(rate),
                                                                                      th=Common.float2percent(
                                                                                          threshold))
            SpiderNotify.notify(param)
def download(urlfilepath):
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)  
    filename = os.path.basename(urlfilepath)
    writeTmpfile = os.path.join(donepath, filename+'.temp')
    writefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile) 
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        readlines = FileUtility.readlines(urlfilepath)
        for line in readlines:
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # create an empty temp file so the downloaders can append to it
    with open(writeTmpfile, 'a+') as filetemp:
        filetemp.write('')
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, writeTmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, writeTmpfile)
    else:
        downGet(urlfilepath, writeTmpfile)
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=writefile))
    FileUtility.remove(urlfilepath)       
 def __init__(self, taskinfo=None, download_path=None):
     self.taskinfo = taskinfo
     self.maxfilenum = 100
     self.cache_path = Storage.getstoragelocation(
         const.SPIDER_DONE_TEMP_PATH)
     path = SpiderConfigure.getconfig(
         const.SPIDER_TENCENT_PLATFORM_DOMAIN,
         const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
     if download_path:
         self.download_path = download_path
     else:
         self.download_path = PUCDownloader.DOWNLOAD_PATH.format(
             path=path, taskid=self.taskinfo.taskid)
     self.parse_tool = SpiderConfigure.getconfig(
         const.SPIDER_TENCENT_PLATFORM_DOMAIN,
         const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
     #self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
     self.pucbackpath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_PUC_BACKUP_PATH) + self.taskinfo.taskid
     self.pucbacktoday = os.path.join(self.pucbackpath,
                                      TimeUtility.getcurrentdate())
     if not FileUtility.exists(self.pucbackpath):
         FileUtility.mkdirs(self.pucbackpath)
     if not FileUtility.exists(self.pucbacktoday):
         FileUtility.mkdirs(self.pucbacktoday)
     self.done_file = self.pucbacktoday + '/done/'
     self.json_path = self.pucbacktoday + '/json/'
     if not FileUtility.exists(self.done_file):
         FileUtility.mkdirs(self.done_file)
     if not FileUtility.exists(self.json_path):
         FileUtility.mkdirs(self.json_path)
     self.pucsavedays = 0
     self.clear()
def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.removefiles(donepath)
    backupPath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_URL_BACKUP), TimeUtility.getcurrentdate())
    interval = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_INTERVAL)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath) 
    while True:
        Logger.getlogging().debug('scanning')
        flag = False
        for filename in os.listdir(scanningPath):
            try:
                urlfilepath = os.path.join(scanningPath, filename)
                backupfile  = os.path.join(backupPath, filename)
                if os.path.isfile(urlfilepath) and 'tmp' not in filename:
                    Logger.getlogging().info('Get url file:{file}'.format(file=filename))
                    FileUtility.copy(urlfilepath, backupfile)
                    download(urlfilepath)
                if not flag:
                    flag = True
            except:
                Logger.printexception()
        if not flag:
            Logger.getlogging().debug('scanning interval sleeping {interval}s'.format(interval=interval))
            time.sleep(int(interval))    
def readFile(urlpath, filename):
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                       const.SPIDER_POST_WHOAMI)
    donepath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    writeTmpfile = os.path.join(donepath, filename + '.tmp')
    now = str(int(time.time()))
    writefile = os.path.join(donepath, filename + '.txt.' + now + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    Logger.getlogging().debug('post_done start:{f}'.format(f=writefile))
    with open(urlpath, 'r') as fp:
        lines = fp.readlines()
        os.mknod(writeTmpfile)  # create the empty temp file (Linux-specific)
        for line in lines:
            try:
                # parse inside the try block so one malformed line does not
                # abort the whole file
                jsonLine = json.loads(line)
                jsonStr = downPost(jsonLine)
                with open(writeTmpfile, 'a+') as filetemp:
                    filetemp.write(jsonStr + '\n')
                Logger.getlogging().debug(
                    '{url}:Post request succeeded'.format(url=jsonLine['url']))
            except:
                Logger.getlogging().warning(
                    'Post request failed:{line}'.format(line=line.strip()))
                Logger.printexception()
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('post_done end:{f}'.format(f=writefile))
    FileUtility.remove(urlpath)
 def __init__(self):
     # download platforms
     SQLDAO.getinstance()
     self.downloader = Downloader()
     self.wdownloader = WDownloader()
     # ETL controller
     self.etl = ETLController()
     self.waitingperiod = int(
         SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                   const.SPIDER_WAITING_PERIOD))
     self.timeout = int(2 * int(
         SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                   const.SPIDER_WAIT_PLATFORM_TIMEOUT)))
     self.spiderstarttime = int(time.time())
     self.waibutimeout = 2 * 60 * 60
 def __init__(self):
     # URL used to identify the class that returns S2 query results; using the site's main URL is recommended
     self.fakeoriginalurl = 'http://query.website.com/'
     self.querylastdays = int(SpiderConfigure.getinstance().getlastdays())
     self.website = self
     self.maxpages = int(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                   const.SPIDER_S2_MAX_QUERY_PAGES))
 def initwebkit(self):
     for task in SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             const.SPIDER_TENCENT_PLATFORM_WEBKIT_TASK_LIST).split(','):
         taskinfo = TaskInfo()
         task = task.strip()
         taskinfo.taskid = SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             task + constant.SPIDER_TASKID)
         taskinfo.taskname = SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             task + constant.SPIDER_TASKNAME)
         taskinfo.userid = SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             task + constant.SPIDER_USERID)
         self.wimpls.append(TencentDownloader(taskinfo))
 def __init__(self):
     self.factory = SiteFactory()
     self.conf = SpiderConfigure.getinstance()
     self.urlbackuppath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_URL_BACKUP_PATH) + TimeUtility.getcurrentdate()
     self.period = int(SpiderConfigure.getinstance().getlastdays())
 def initwaibu(self):
     for dl in SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             const.SPIDER_TENCENT_PLATFORM_WBTASK_LIST).split(','):
         info = WaibiDownloaderInfo()
         info.taskname = SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             dl + constant.SPIDER_WBTASK_NAME)
         info.token = SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             dl + constant.SPIDER_WBTASK_TOKEN)
         info.appid = SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             dl + constant.SPIDER_WBTASK_APPID)
         self.wbimpls[WaibiDownloader(info)] = ''
         self.tasknamelist[info.taskname] = ''
 def createdatabase(self):
     self.createcollection(self.SPIDER_COLLECTION_NEWS,
                           self.SPIDER_COLLECTION_NEWS_INDEX)
     self.createcollection(self.SPIDER_COLLECTION_CHANNEL,
                           self.SPIDER_COLLECTION_CHANNEL_INDEX)
     self.createcollection(self.SPIDER_COLLECTION_WEBSITE,
                           self.SPIDER_COLLECTION_WEBSITE_INDEX)
     self.createcollection(self.SPIDER_COLLECTION_COMMENTS,
                           self.SPIDER_COLLECTION_COMMENTS_INDEX)
     self.createcollection(self.SPIDER_COLLECTION_IMAGE,
                           self.SPIDER_COLLECTION_IMAGE_ID_INDEX)
     jsonfile = SpiderConfigure.getconfig(
         const.SPIDER_DATABASE_DOMAIN, const.SPIDER_DATABASE_CHANNEL_CONFIG)
     self.loadfile(self.SPIDER_COLLECTION_CHANNEL, jsonfile)
     jsonfile = SpiderConfigure.getconfig(
         const.SPIDER_DATABASE_DOMAIN, const.SPIDER_DATABASE_WEBSITE_CONFIG)
     self.loadfile(self.SPIDER_COLLECTION_WEBSITE, jsonfile)
 def __init__(self):
     self.url_beforenewsinfo_map = {SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: {},
                                    SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: {},
                                    SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: {},
                                    SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: {}}
     self.url_beforenewsnum_map = {}
     self.url_curcmtcontent_map = {}
     self.url_curcmtnum_map = {}
     self.url_beforecmtnum_map = {}
     date = TimeUtility.getcurrentdate()
     path = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,const.SPIDER_OUTPUT_PATH), date)     
     suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,const.SPIDER_OUTPUT_FILENAME_SUFFIX)  
     self.outputpath = FileFormat.OUTPUTPATH.format(path=path, suffix=suffix, date=date.replace('-', '_'), ts=int(time.time()))
     self.errorinfopath = FileFormat.ERRORINFOPATH.format(path=path, suffix=suffix, date=date.replace('-', '_'), ts=int(time.time()))
     self.pushpath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,const.SPIDER_PUSH_PATH_MASTER), date)
     if not FileUtility.exists(path):
         FileUtility.mkdirs(path)    
 def initlocal(self):
     """"""
     for dl in SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN,
             const.SPIDER_LOCAL_DOWNLOADER_LIST).split(','):
         info = LocalDownloaderInfo()
         dl = dl.strip()
         info.ip = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                             dl + constant.DOWNLOADER_IP)
         info.port = int(
             SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                       dl + constant.DOWNLOADER_PORT))
         info.username = SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN, dl + constant.DOWNLOADER_USERNAME)
         info.password = SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN, dl + constant.DOWNLOADER_PASSWORD)
         info.urlpath = SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN, dl + constant.DOWNLOADER_URL_PATH)
         info.donepath = SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN, dl + constant.DOWNLOADER_DONE_PATH)
         info.localdonepath = Storage.getstoragelocation(
             const.SPIDER_DONE_TEMP_PATH)
         info.jsonpath = Storage.getstoragelocation(
             const.SPIDER_JSON_TEMP_PATH)
         self.limpls.append(LocalDownloader(info))
 def upload(self):
     upfiles = FileUtility.getfilelist(
         SpiderConfigure.getconfig(const.SPIDER_SCHEDULER_DOMAIN,
                                   const.SCHEDULER_URL_PATH), [])
     donefiles = [
         dfile for dfile in upfiles
         if dfile.endswith(constant.POST_FILE_SUFFIX)
     ]
     return self.downloader.upload(donefiles)
 def removecachefile():
     cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_TEMPLATE_WORK_DIRECTORY)
     databackupfolder = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                    const.SPIDER_DATA_BACKUP_PATH) + TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
     if FileUtility.exists(cache):
         FileUtility.move(cache, databackupfolder)
         FileUtility.rmdir(cache)
     limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH_LIMIT))
     databackuppath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_DATA_BACKUP_PATH)
     if FileUtility.exists(databackuppath):
         validdate = TimeUtility.getdatebefore(limit, '%Y%m%d000000')
         for s in os.listdir(databackuppath):
             if s < validdate:
                 fullpath = os.path.join(databackuppath, s)
                 Logger.getlogging().info('remove cache folder ' + fullpath)
                 FileUtility.rmdir(fullpath)
 def __init__(self):
     """
     # @functions:__init__
     # @param: none
     # @return: none
     # @note: constructor of the MongoDAO class; initializes internal members
     """
     self.ip = SpiderConfigure.getconfig(const.SPIDER_DATABASE_DOMAIN,
                                         const.SPIDER_DATABASE_IP)
     self.port = int(
         SpiderConfigure.getconfig(const.SPIDER_DATABASE_DOMAIN,
                                   const.SPIDER_DATABASE_PORT))
     self.database = SpiderConfigure.getconfig(
         const.SPIDER_DATABASE_DOMAIN, const.SPIDER_DATABASE_DATABASE)
     self.connected = False
     self.client = None
     self.retrytime = 0
     self.checktime = MongoDAO.gettime()
     self.createdatabase()
 def gettiebaqueryfromdb(self):
     # path of the S2 query output file
     tiebafile = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                           const.SPIDER_S3_INPUT_FILE)
     temppath = Storage.getstoragelocation(
         const.SPIDER_TIEBA_TEMP_PATH) + FileUtility.getfilename(tiebafile)
     QueryStorage.getinstance().getlocalquerys_tieba(
         temppath, ETLController.LOCALMACHINEFLAG)
     if FileUtility.exists(temppath):
         return temppath
 def mkcachedir():
     cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_TEMPLATE_WORK_DIRECTORY)
     FileUtility.rmdir(cache)
     FileUtility.mkdirs(cache)
     FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH))
     FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_WAIBU_TEMP_PATH))
     FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_TIEBA_TEMP_PATH))
     FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_URLS_TEMP_PATH))
     FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH))
     FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH))
     FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_OUTPUT_TEMP_PATH))
   
     limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH_LIMIT))
     outputpath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH)
     if FileUtility.exists(outputpath):
         validdate = TimeUtility.getuniformdatebefore(limit)
         for s in os.listdir(outputpath):
             if s < validdate:
                 fullpath = os.path.join(outputpath, s)
                 FileUtility.rmdir(fullpath)
 def __init__(self, info):
     self.info = info
     self.upload_file_list = {}
     self.recycle_times = 0
     self.download_file_list = []
     # newly added fields
     self.uploadfile_retranslist = {}
     self.outtimelimit = int(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN, const.SPIDER_WAIT_PLATFORM_TIMEOUT))
     #self.outtimelimit = 30
     self.download_time = 0  
     self.taskstatusflag = True
 def __init__(self):
     self.database = SpiderDao()
     suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                        const.SPIDER_OUTPUT_FILENAME_SUFFIX)
     ts = TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
     self.difffile = '{path}/{dt}/{file}'.format(
         path=SpiderConfigure.getinstance().getconfig(
             const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH),
         dt=TimeUtility.getcurrentdate(),
         file=DiffController.DIFF_FILE_NAME_FORMAT.format(suffix=suffix,
                                                          ts=ts))
# NOTE: urlKey, formhash and firstTime were locals in the original, which
# reset on every call and disabled the per-domain throttling below; they are
# hoisted to module-level state here (an assumption) so that it persists
# across calls.
urlKey = ''
formhash = ''
firstTime = 0.0


def downPost(urlitem):
    global urlKey, formhash, firstTime
    interval = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                         const.DOWNLOADER_INTERVAL)
    timeout = 60  # assumed request timeout in seconds; the name was undefined
                  # in the original snippet
    postheaders = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language':
        'zh-CN,zh;q=0.8',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }

    url = urlitem['url']
    data = urlitem['data']
    thisUrlKey = TemplateManager.getdomain(url)
    if thisUrlKey == urlKey:
        now = time.time()
        timedif = now - firstTime
        Logger.getlogging().debug('timedif:{t} interval:{i}'.format(
            t=timedif, i=interval))
        if timedif < float(interval):
            time.sleep(float(interval) - timedif)
        if constant.SPIDER_POST_FORMHASH_VALUE in data:
            data = data.replace(constant.SPIDER_POST_FORMHASH_VALUE, formhash)
    else:
        cookie = createCookie(url)
        postheaders['Cookie'] = cookie
        if constant.SPIDER_POST_FORMHASH_VALUE in data:
            getRequest = urllib2.urlopen(
                urllib2.Request(url, headers=postheaders))
            formhash = H.document_fromstring(getRequest.read()).xpath(
                "// input[ @ name = 'formhash'] / @value")[0]
            data = data.replace(constant.SPIDER_POST_FORMHASH_VALUE, formhash)
    req = urllib2.Request(url, data, headers=postheaders)
    response = urllib2.urlopen(req, timeout=timeout)
    the_page = response.read()
    urlKey = thisUrlKey
    firstTime = time.time()
    saveJson = {}
    saveJson['html'] = Common.urlenc(the_page)
    saveJson['data'] = urlitem['data']
    saveJson['foundin'] = url
    saveJson['crawler_time'] = str(int(time.time()))
    jsonStr = json.dumps(saveJson)
    return jsonStr
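
# A minimal usage sketch for downPost with hypothetical values: readFile
# above feeds it one JSON object per line, each carrying 'url' and 'data'
# keys, and receives back a JSON string holding the url-encoded page html
# plus the posted data, foundin and crawler_time fields.
def _downpost_example():
    item = {'url': 'http://bbs.example.com/post.php',
            'data': 'message=hello&submit=1'}
    return downPost(item)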
 def __init__(self):
     self.reportlist = {}
     self.s2sitereportlist = {}
     self.s2urlfilepath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                    const.SPIDER_S2_QUERY_URLS_FILE).format(
         date=TimeUtility.getcurrentdate())
     FileUtility.remove(self.s2urlfilepath)
     self.totalreport = Report()
     self.totalreport.channel = 'SUM'
     self.s1urls = []
     self.querysitesmap = {}
     self.s2sitenum = 0
     self.s2urlsitemap = {}
def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                       const.SPIDER_POST_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    backupPath = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                           const.DOWNLOADER_URL_BACKUP)

    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath)
    flag = False
    for filename in os.listdir(scanningPath):
        fp = os.path.join(scanningPath, filename)
        backupfile = os.path.join(backupPath, filename)
        if os.path.isfile(fp) and 'tmp' not in filename:
            Logger.getlogging().info(
                'Get url file:{file}'.format(file=filename))
            FileUtility.move(fp, backupfile)
            readFile(backupfile, filename)
        if not flag:
            flag = True
    if not flag:
        time.sleep(10)
    def storagequery(self):
        QueryStorage.updatedb()
        SpiderConfigure.getinstance().setchannel(constant.SPIDER_CHANNEL_S2)
        s2file = SpiderConfigure.getinstance().gets2file()
        if FileUtility.exists(s2file):
            lines = FileUtility.readlines(s2file)
            for strquery in lines:
                QueryStorage.getinstance().storequery(strquery)
                QueryStorage.getinstance().storewaibuquery(strquery)

        tiebafile = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                              const.SPIDER_S3_INPUT_FILE)
        if FileUtility.exists(tiebafile):
            lines = FileUtility.readlines(tiebafile)
            for strquery in lines:
                if not self.checks3query(strquery):
                    continue
                query = strquery.split('\t')[0].strip()
                url = strquery.split('\t')[1].strip()
                QueryStorage.getinstance().storetiebaquery(query, url)
 def __init__(self):
     self.upload_file_list = {}
     self.impls = []
     self.implsindex = 0
     self.initcommon()
     self.wimpls = []
     self.wimplsindex = 0
     self.initwebkit()
     self.limpls = []
     self.limplsindex = 0
     self.initlocal()
     self.tempurlpath = Storage.getstoragelocation(
         const.SPIDER_URLS_TEMP_PATH)
     self.urlbackuppath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_URL_BACKUP_PATH) + TimeUtility.getcurrentdate()
     # retry mechanism for files that failed to download
     self.retransmissionfiles = {}
     self.all_retransmissionfiles = {}
     self.retransmissionlimitnum = 3
     self.filetime = 0
 def __init__(self):
     self.database = SpiderConfigure.getconfig(
         const.SPIDER_DATABASE_DOMAIN, const.SPIDER_DATABASE_DATABASE)
     self.ip = SpiderConfigure.getconfig(const.SPIDER_DATABASE_DOMAIN,
                                         const.SPIDER_DATABASE_IP)
     self.port = int(
         SpiderConfigure.getconfig(const.SPIDER_DATABASE_DOMAIN,
                                   const.SPIDER_DATABASE_PORT))
     self.password = SpiderConfigure.getconfig(
         const.SPIDER_DATABASE_DOMAIN, const.SPIDER_DATABASE_PASSWORD)
     self.user = SpiderConfigure.getconfig(const.SPIDER_DATABASE_DOMAIN,
                                           const.SPIDER_DATABASE_USERNAME)
     self.charset = SpiderConfigure.getconfig(const.SPIDER_DATABASE_DOMAIN,
                                              const.SPIDER_DATABASE_CHARSET)
     self.connected = False
     #self.connect = None
     self.retrytime = 0
     self.checktime = SQLDAO.gettime()
     self.createdatabase()
    def __init__(self, taskinfo):
        self.taskinfo = taskinfo
        self.upload_url = SpiderConfigure.getconfig(
            const.SPIDER_TENCENT_PLATFORM_DOMAIN,
            const.SPIDER_TENCENT_PLATFORM_UPLOAD_URL)
        self.cache_path = Storage.getstoragelocation(
            const.SPIDER_DONE_TEMP_PATH)
        path = SpiderConfigure.getconfig(
            const.SPIDER_TENCENT_PLATFORM_DOMAIN,
            const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
        self.download_path = TencentDownloader.DOWNLOAD_PATH.format(
            path=path, taskid=self.taskinfo.taskid)

        self.parse_tool = SpiderConfigure.getconfig(
            const.SPIDER_TENCENT_PLATFORM_DOMAIN,
            const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL)
        self.parse_tool_img = SpiderConfigure.getconfig(
            const.SPIDER_TENCENT_PLATFORM_DOMAIN,
            const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
        self.json_path = Storage.getstoragelocation(
            const.SPIDER_JSON_TEMP_PATH)
        self.upload_file_list = {}
        self.recycle_times = 0
        self.download_file_list = []
        self.download_file_list2 = []
        self.retrytimes = int(
            SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                      const.SPIDER_UPLOAD_RETRY_TIMES))
        # newly added fields
        self.uploadfile_retranslist = {}
        self.outtimelimit = int(
            SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                      const.SPIDER_WAIT_PLATFORM_TIMEOUT))
        # self.outtimelimit = 10
        self.download_time = time.time()
        self.taskstatusflag = True
        self.start_time = 0
class QueryStorage:
    MACHINEFLIST = SpiderConfigure.getconfig(
        const.SPIDER_TENCENT_PLATFORM_DOMAIN,
        const.SPIDER_TENCENT_PLATFORM_MACHINE_LIST)
    MACHINEFLAGLIST = [
        item.replace('.', '') for item in MACHINEFLIST.split(',')
    ]

    MACHINEFLIST_WAIBU = SpiderConfigure.getconfig(
        const.SPIDER_TENCENT_PLATFORM_DOMAIN,
        const.SPIDER_TENCENT_PLATFORM_MACHINE_LIST_WAIBU)
    MACHINEFLAGLIST_WAIBU = [
        item.replace('.', '') for item in MACHINEFLIST_WAIBU.split(',')
    ]

    MACHINEFLIST_TIEBA = SpiderConfigure.getconfig(
        const.SPIDER_TENCENT_PLATFORM_DOMAIN,
        const.SPIDER_TENCENT_PLATFORM_MACHINE_LIST_TIEBA)
    MACHINEFLAGLIST_TIEBA = [
        item.replace('.', '') for item in MACHINEFLIST_TIEBA.split(',')
    ]

    LOCALMACHINEFLAG = SpiderConfigure.getinstance().localmachineflag()
    LOCALQUERYPATH = './data/temp/query/query.txt'
    __instance = None
    __querys = None
    __querys_tieba = None

    #----------------------------------------------------------------------
    def __init__(self):
        self.querystorage = {}
        self.querystorage_waibu = {}
        self.querystorage_tieba = {}
        for machine in QueryStorage.MACHINEFLAGLIST:
            self.querystorage[machine] = 0
        for machine in QueryStorage.MACHINEFLAGLIST_WAIBU:
            self.querystorage_waibu[machine] = 0
        for machine in QueryStorage.MACHINEFLAGLIST_TIEBA:
            self.querystorage_tieba[machine] = 0

    #----------------------------------------------------------------------
    @staticmethod
    def getinstance():
        if not QueryStorage.__instance:
            QueryStorage.__instance = QueryStorage()
        return QueryStorage.__instance

    #----------------------------------------------------------------------
    @staticmethod
    def updatedb():
        for machine in QueryStorage.MACHINEFLAGLIST:
            SQLDAO.getinstance().update(
                SQLDAO.SPIDER_TABLE_QUERYS,
                where={
                    SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: machine,
                    SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1
                },
                update={SQLDAO.SPIDER_TABLE_QUERYS_VALID: 0})
            SQLDAO.getinstance().update(
                SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
                where={
                    SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: machine,
                    SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1
                },
                update={SQLDAO.SPIDER_TABLE_QUERYS_VALID: 0})
        for machine in QueryStorage.MACHINEFLAGLIST_WAIBU:
            SQLDAO.getinstance().update(
                SQLDAO.SPIDER_TABLE_QUERYS,
                where={
                    SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: machine,
                    SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1
                },
                update={SQLDAO.SPIDER_TABLE_QUERYS_VALID: 0})

    #----------------------------------------------------------------------
    def storequery(self, query, machineflaglist=MACHINEFLAGLIST):
        # If the query already exists, refresh its update time; otherwise
        # store it on the machine that currently holds the fewest queries.
        query = query.strip()
        result = QueryStorage.find(query, machineflaglist)
        if result:
            resultdict = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
                                            result)
            machine = resultdict[SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG]
            id = QueryStorage.getid(query, machine)
            SQLDAO.getinstance().update(
                SQLDAO.SPIDER_TABLE_QUERYS,
                {SQLDAO.SPIDER_TABLE_QUERYS_ID: id}, {
                    SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
                    SpiderConfigure.getinstance().starttime(),
                    SQLDAO.SPIDER_TABLE_QUERYS_VALID:
                    1
                })
        else:
            machine = min(self.querystorage.iteritems(), key=lambda x: x[1])[0]
            data = {
                SQLDAO.SPIDER_TABLE_QUERYS_ID:
                QueryStorage.getid(query, machine),
                SQLDAO.SPIDER_TABLE_QUERYS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_QUERYS_CREATEDATE:
                SpiderConfigure.getinstance().starttime(),
                SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
                SpiderConfigure.getinstance().starttime(),
                SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG:
                machine,
                SQLDAO.SPIDER_TABLE_QUERYS_VALID:
                1
            }
            SQLDAO.getinstance().insert(
                SQLDAO.SPIDER_TABLE_QUERYS, SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
                SQLDAO.getvaluesfromkeys(data,
                                         SQLDAO.SPIDER_TABLE_QUERYS_KEYS))
        # running count of queries stored on each machine
        self.querystorage[machine] = self.querystorage.get(machine, 0) + 1

    #----------------------------------------------------------------------
    def storewaibuquery(self, query, machineflaglist=MACHINEFLAGLIST_WAIBU):
        # If the query already exists, refresh its update time; otherwise
        # store it on the machine that currently holds the fewest queries.
        query = query.strip()
        result = QueryStorage.find(query, machineflaglist)
        if result:
            resultdict = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
                                            result)
            machine = resultdict[SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG]
            id = QueryStorage.getid(query, machine)
            SQLDAO.getinstance().update(
                SQLDAO.SPIDER_TABLE_QUERYS,
                {SQLDAO.SPIDER_TABLE_QUERYS_ID: id}, {
                    SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
                    SpiderConfigure.getinstance().starttime(),
                    SQLDAO.SPIDER_TABLE_QUERYS_VALID:
                    1
                })
        else:
            machine = min(self.querystorage_waibu.iteritems(),
                          key=lambda x: x[1])[0]
            data = {
                SQLDAO.SPIDER_TABLE_QUERYS_ID:
                QueryStorage.getid(query, machine),
                SQLDAO.SPIDER_TABLE_QUERYS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_QUERYS_CREATEDATE:
                SpiderConfigure.getinstance().starttime(),
                SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
                SpiderConfigure.getinstance().starttime(),
                SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG:
                machine,
                SQLDAO.SPIDER_TABLE_QUERYS_VALID:
                1
            }
            SQLDAO.getinstance().insert(
                SQLDAO.SPIDER_TABLE_QUERYS, SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
                SQLDAO.getvaluesfromkeys(data,
                                         SQLDAO.SPIDER_TABLE_QUERYS_KEYS))
        # running count of queries stored on each machine
        self.querystorage_waibu[machine] = self.querystorage_waibu.get(
            machine, 0) + 1

    #----------------------------------------------------------------------
    def storetiebaquery(self,
                        query,
                        queryurl,
                        machineflaglist=MACHINEFLAGLIST_TIEBA):
        # If the query already exists, refresh its update time; otherwise
        # store it on the machine that currently holds the fewest queries.
        query = query.strip()
        queryurl = queryurl.strip()
        result = QueryStorage.find(query,
                                   machineflaglist,
                                   table=SQLDAO.SPIDER_TABLE_QUERYS_TIEBA)
        if result:
            resultdict = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
                                            result)
            machine = resultdict[SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG]
            id = QueryStorage.getid(query, machine)
            SQLDAO.getinstance().update(
                SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
                {SQLDAO.SPIDER_TABLE_QUERYS_ID: id}, {
                    SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
                    SpiderConfigure.getinstance().starttime(),
                    SQLDAO.SPIDER_TABLE_QUERYS_VALID:
                    1
                })
        else:
            machine = min(self.querystorage_tieba.iteritems(),
                          key=lambda x: x[1])[0]
            data = {
                SQLDAO.SPIDER_TABLE_QUERYS_ID:
                QueryStorage.getid(query, machine),
                SQLDAO.SPIDER_TABLE_QUERYS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_QUERYS_CREATEDATE:
                SpiderConfigure.getinstance().starttime(),
                SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
                SpiderConfigure.getinstance().starttime(),
                SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG:
                machine,
                SQLDAO.SPIDER_TABLE_QUERYS_QUERYURL:
                queryurl,
                SQLDAO.SPIDER_TABLE_QUERYS_VALID:
                1
            }
            SQLDAO.getinstance().insert(
                SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
                SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
                SQLDAO.getvaluesfromkeys(data,
                                         SQLDAO.SPIDER_TABLE_QUERYS_KEYS))
        # running count of queries stored on each machine
        self.querystorage_tieba[machine] = self.querystorage_tieba.get(
            machine, 0) + 1

    #----------------------------------------------------------------------
    @staticmethod
    def find(query,
             machineflaglist=MACHINEFLAGLIST,
             table=SQLDAO.SPIDER_TABLE_QUERYS):
        wheref = '{querykey}="{query}" and {machikey} in ({machine})'
        where = wheref.format(querykey=SQLDAO.SPIDER_TABLE_QUERYS_QUERY,
                              query=query,
                              machikey=SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG,
                              machine=','.join(machineflaglist))
        return SQLDAO.getinstance().find(table, where, multi=False)

    #----------------------------------------------------------------------
    @staticmethod
    def getid(query, machine):
        return Common.md5(Common.urlenc(query) + machine)

    # queries for local and external (waibu) data
    #----------------------------------------------------------------------
    @staticmethod
    def dumplocalquerys(queryfile=LOCALQUERYPATH,
                        localmachine=LOCALMACHINEFLAG):
        #todaymid = time.mktime(time.strptime(TimeUtility.getcurrentdate(), TimeUtility.DATE_FORMAT_DEFAULT))
        where = {
            SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: localmachine,
            SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1
        }
        results = SQLDAO.getinstance().find(
            SQLDAO.SPIDER_TABLE_QUERYS,
            where,
            keys=[SQLDAO.SPIDER_TABLE_QUERYS_QUERY])
        querys = [''.join(item) for item in results]
        with open(queryfile, 'w+') as fp:
            fp.write('\n'.join(querys))
        return querys

    #----------------------------------------------------------------------
    @staticmethod
    def getlocalquerys(queryfile=LOCALQUERYPATH,
                       localmachine=LOCALMACHINEFLAG):
        if not QueryStorage.__querys:
            QueryStorage.__querys = QueryStorage.dumplocalquerys(
                queryfile, localmachine)
        return QueryStorage.__querys

    # Baidu Tieba queries
    #----------------------------------------------------------------------
    @staticmethod
    def dumplocalquerys_tieba(queryfile=LOCALQUERYPATH,
                              localmachine=LOCALMACHINEFLAG):
        where = {
            SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: localmachine,
            SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1
        }
        results = SQLDAO.getinstance().find(
            SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
            where,
            keys=[
                SQLDAO.SPIDER_TABLE_QUERYS_QUERY,
                SQLDAO.SPIDER_TABLE_QUERYS_QUERYURL
            ])
        querys = ['\t'.join(item) for item in results]
        with open(queryfile, 'w+') as fp:
            fp.write('\n'.join(querys))
        return querys

    #----------------------------------------------------------------------
    @staticmethod
    def getlocalquerys_tieba(queryfile=LOCALQUERYPATH,
                             localmachine=LOCALMACHINEFLAG):
        if not QueryStorage.__querys_tieba:
            QueryStorage.__querys_tieba = QueryStorage.dumplocalquerys_tieba(
                queryfile, localmachine)
        return QueryStorage.__querys_tieba
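
# A brief usage sketch of QueryStorage, assuming SQLDAO and the spider
# configuration are initialized (it mirrors storagequery above): invalidate
# the previous assignments, store a query on the least-loaded machine, then
# dump the queries assigned to this machine.
def _querystorage_example():
    QueryStorage.updatedb()
    QueryStorage.getinstance().storequery('some query')  # hypothetical query
    return QueryStorage.getlocalquerys()  # also writes LOCALQUERYPATH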
 def getstoragelocation(subpath):
     cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_TEMPLATE_WORK_DIRECTORY)
     return Storage.STORAGE_LOCATION_FORMAT.format(parent=cache, child=subpath)
 def __init__(self):
     self.dbfile = SpiderConfigure.getconfig(const.SPIDER_DB_DOMAIN,
                                             const.SPIDER_DB_FILEPATH)