Example #1
 def __init__(self):
     # Use this URL to identify the class that receives returned S2 query results; the site's main URL is recommended
     self.fakeoriginalurl = 'http://query.website.com/'
     self.querylastdays = int(SpiderConfigure.getinstance().getlastdays())
     self.website = self
     self.maxpages = int(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                   const.SPIDER_S2_MAX_QUERY_PAGES))
Example #2
 def __init__(self, taskinfo=None, download_path=None):
     self.taskinfo = taskinfo
     self.maxfilenum = 100
     self.cache_path = Storage.getstoragelocation(
         const.SPIDER_DONE_TEMP_PATH)
     path = SpiderConfigure.getconfig(
         const.SPIDER_TENCENT_PLATFORM_DOMAIN,
         const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
     if download_path:
         self.download_path = download_path
     else:
         self.download_path = PUCDownloader.DOWNLOAD_PATH.format(
             path=path, taskid=self.taskinfo.taskid)
     self.parse_tool = SpiderConfigure.getconfig(
         const.SPIDER_TENCENT_PLATFORM_DOMAIN,
         const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
     #self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
     self.pucbackpath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_PUC_BACKUP_PATH) + self.taskinfo.taskid
     self.pucbacktoday = os.path.join(self.pucbackpath,
                                      TimeUtility.getcurrentdate())
     if not FileUtility.exists(self.pucbackpath):
         FileUtility.mkdirs(self.pucbackpath)
     if not FileUtility.exists(self.pucbacktoday):
         FileUtility.mkdirs(self.pucbacktoday)
     self.done_file = self.pucbacktoday + '/done/'
     self.json_path = self.pucbacktoday + '/json/'
     if not FileUtility.exists(self.done_file):
         FileUtility.mkdirs(self.done_file)
     if not FileUtility.exists(self.json_path):
         FileUtility.mkdirs(self.json_path)
     self.pucsavedays = 0
     self.clear()
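The constructor above lays out a per-task, per-day backup tree (taskid/date/done and taskid/date/json), creating each level only when it is missing. A minimal standalone sketch of the same layout, using only the standard library (the root path, task id, and date format are hypothetical):

import os
from datetime import date

def make_backup_dirs(root, taskid):
    # <root>/<taskid>/<YYYY-MM-DD>/{done,json}, created idempotently
    today = os.path.join(root, taskid, date.today().isoformat())
    done_path = os.path.join(today, 'done')
    json_path = os.path.join(today, 'json')
    for p in (done_path, json_path):
        os.makedirs(p, exist_ok=True)  # no error if the directory already exists
    return done_path, json_path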
Example #3
def readFile(urlpath, filename):
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                       const.SPIDER_POST_WHOAMI)
    donepath = SpiderConfigure.getconfig(
        const.SPIDER_POST_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    writeTmpfile = donepath + filename + '.tmp'
    now = str(time.time()).split('.')[0]
    writefile = donepath + filename + '.txt.' + now + '.done'
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    Logger.getlogging().debug('post_done start:{f}'.format(f=writefile))
    with open(urlpath, 'r') as fp:
        lines = fp.readlines()
        os.mknod(writeTmpfile)
        for line in lines:
            jsonLine = json.loads(line)
            try:
                jsonStr = downPost(jsonLine)
                with open(writeTmpfile, 'a+') as filetemp:
                    filetemp.write(jsonStr + '\n')
                Logger.getlogging().debug(
                    '{url}:Post request succeeded'.format(url=jsonLine['url']))
            except:
                Logger.getlogging().warning(
                    '{url}:Post request failed'.format(url=jsonLine['url']))
                Logger.printexception()
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('post_done end:{f}'.format(f=writefile))
    FileUtility.remove(urlpath)
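readFile writes each result line to a .tmp file and only renames it to its final .done name after the whole input has been processed, so a consumer watching for .done files never picks up a half-written one. A minimal sketch of that handoff (the paths and naming scheme are hypothetical; the rename-at-the-end step is the point):

import os
import time

def write_done_file(donepath, filename, lines):
    tmp = os.path.join(donepath, filename + '.tmp')
    final = os.path.join(donepath, '{0}.txt.{1}.done'.format(filename, int(time.time())))
    with open(tmp, 'w') as fp:
        for line in lines:
            fp.write(line + '\n')
    os.rename(tmp, final)  # atomic on POSIX: the .done file appears fully written or not at all
    return final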
Example #4
 def __init__(self):
     self.factory = SiteFactory()
     self.conf = SpiderConfigure.getinstance()
     self.urlbackuppath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_URL_BACKUP_PATH) + TimeUtility.getcurrentdate()
     self.period = int(SpiderConfigure.getinstance().getlastdays())
Example #5
 def copyfiles(self):
     # s1/s2 input paths
     s1file = SpiderConfigure.getinstance().gets1file()
     s2file = SpiderConfigure.getinstance().gets2file()
     # s1/s2 history paths
     self.conf.setchannel(SPIDER_CHANNEL_S1)
     s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
     if FileUtility.exists(s1file):
         lines = 0
         firstline = True
         with open(s1file, 'r') as fp:
             for line in fp.readlines():
                 line = line.strip()
                 if firstline:
                     firstline = False
                     if line[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                         line = line[3:]
                 if line:
                     lines += 1
                     SpiderReport.puts1url(line)
         if lines > 0:
             FileUtility.copy(s1file, s1tempfile)
             SpiderReport.update(SPIDER_CHANNEL_S1, '', SpiderReport.URL_UPLOAD, lines)
     if FileUtility.exists(s2file):
         FileUtility.copy(s2file, s2temppath)
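Both copyfiles and s2query (Example #16) strip a UTF-8 byte-order mark from the first line before using it, since files uploaded from Windows editors often carry one. The check in isolation, reading in binary mode so the comparison against codecs.BOM_UTF8 is byte-for-byte:

import codecs

def iter_lines_without_bom(path):
    with open(path, 'rb') as fp:
        first = True
        for line in fp:
            if first:
                first = False
                if line[:3] == codecs.BOM_UTF8:  # b'\xef\xbb\xbf'
                    line = line[3:]
            line = line.strip()
            if line:
                yield line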
Example #6
def download(urlfilepath):
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)  
    filename = os.path.basename(urlfilepath)
    writeTmpfile = os.path.join(donepath, filename+'.temp')
    writefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile) 
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        readlines = FileUtility.readlines(urlfilepath)
        for line in readlines:
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # Create an empty file
    with open(writeTmpfile,'a+') as filetemp:
        filetemp.write('')    
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, writeTmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, writeTmpfile)
    else:
        downGet(urlfilepath, writeTmpfile)
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=writefile))
    FileUtility.remove(urlfilepath)       
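download picks a backend from the file suffix, with one extra rule: under the Windows debug flag, a plain URL file that contains any https URL is routed to the webkit downloader, since those pages need a browser engine. A condensed sketch of the dispatch (the suffixes and returned labels are hypothetical stand-ins for the project constants):

def pick_downloader(urlfilepath, lines, debug_on_windows=False):
    # https URLs need the browser-based downloader under the Windows debug build
    has_https = debug_on_windows and any(l.strip().startswith('https') for l in lines)
    if urlfilepath.endswith('.webkit') or has_https:
        return 'webkit'
    if urlfilepath.endswith('.post'):
        return 'post'
    return 'get'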
Example #7
def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.removefiles(donepath)
    backupPath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_URL_BACKUP), TimeUtility.getcurrentdate())
    interval = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_INTERVAL)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath) 
    while True:
        Logger.getlogging().debug('scanning')
        flag = False
        for filename in os.listdir(scanningPath):
            try:
                urlfilepath = os.path.join(scanningPath, filename)
                backupfile  = os.path.join(backupPath, filename)
                if os.path.isfile(urlfilepath) and 'tmp' not in filename:
                    Logger.getlogging().info('Get url file:{file}'.format(file=filename))
                    FileUtility.copy(urlfilepath, backupfile)
                    download(urlfilepath)
                if not flag:
                    flag = True
            except:
                Logger.printexception()
        if not flag:
            Logger.getlogging().debug('scanning interval sleeping {interval}s'.format(interval=interval))
            time.sleep(int(interval))    
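scanning is a plain polling loop: every incoming file is first copied to a dated backup folder, then handed to download, and the loop sleeps only when a pass found nothing to do. The skeleton without the project classes (here the found flag is set only when a file was actually processed, which avoids a busy loop on leftover tmp files):

import os
import shutil
import time

def poll_directory(scan_dir, backup_dir, handle, interval=30):
    while True:
        found = False
        for name in os.listdir(scan_dir):
            path = os.path.join(scan_dir, name)
            if os.path.isfile(path) and 'tmp' not in name:
                shutil.copy(path, os.path.join(backup_dir, name))  # back up before processing
                handle(path)
                found = True
        if not found:
            time.sleep(interval)  # idle pass: back off before rescanning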
Example #8
 def __init__(self):
     self.id = ''
     # query
     self.query = SpiderConfigure.getinstance().getquery()
     # channel
     self.channel = SpiderConfigure.getinstance().getchannel()
     # type
     self.type = ''
     # URL
     self.url = ''
     # title
     self.title = ''
     # body / main post
     self.body = ''
     # comments (content) / replies (content)
     # comment count
     self.cmtnum = -1
     # view count / play count (increment)
     self.clicknum = -1
     # like count
     self.votenum = -1
     # follower count / subscriber count
     self.fansnum = -1
     # publish time
     self.pubtime = TimeUtility.getintformtime(0)
     # createtime
     self.createtime = SpiderConfigure.getinstance().starttime()
Example #9
 def getid(url):
     idformat = '{machine}_{query}_{url}_{starttime}'
     id = idformat.format(
         machine=NewsStorage.LOCALMACHINEFLAG,
         query=Common.urlenc(SpiderConfigure.getinstance().getquery()),
         url=Common.urlenc(url),
         starttime=SpiderConfigure.getinstance().starttime())
     return Common.md5(id)
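getid builds a stable primary key by url-encoding the variable parts, joining them with '_', and hashing the result with MD5, so the same (machine, query, url, starttime) combination always yields the same id. A self-contained equivalent using only the standard library:

import hashlib
from urllib.parse import quote

def make_id(machine, query, url, starttime):
    # url-encode the free-form parts so the '_' separator stays unambiguous
    raw = '{0}_{1}_{2}_{3}'.format(machine, quote(query, safe=''), quote(url, safe=''), starttime)
    return hashlib.md5(raw.encode('utf-8')).hexdigest()

print(make_id('m1', 'some query', 'http://example.com/a?b=1', 1500000000))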
Example #10
 def getinstance():
     channel = SpiderConfigure.getinstance().getchannel()
     if channel == constant.SPIDER_CHANNEL_S2:
         key = SpiderConfigure.getinstance().getquery()
     else:
         key = channel
     if key not in URLManager.__instancemap:
         URLManager.__instancemap[key] = URLManager()
     return URLManager.__instancemap[key]
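getinstance keeps one URLManager per key (the query on the S2 channel, otherwise the channel name) in a class-level map, so repeated lookups with the same key return the same object. The pattern in isolation looks like this:

class KeyedSingleton(object):
    _instances = {}

    def __init__(self, key):
        self.key = key

    @classmethod
    def get(cls, key):
        # create on first use, then always hand back the same instance for this key
        if key not in cls._instances:
            cls._instances[key] = cls(key)
        return cls._instances[key]

assert KeyedSingleton.get('s2') is KeyedSingleton.get('s2')
assert KeyedSingleton.get('s1') is not KeyedSingleton.get('s2')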
Example #11
 def __init__(self):
     self.database = SpiderDao()
     suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                        const.SPIDER_OUTPUT_FILENAME_SUFFIX)
     ts = TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
     self.difffile = '{path}/{dt}/{file}'.format(
         path=SpiderConfigure.getinstance().getconfig(
             const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH),
         dt=TimeUtility.getcurrentdate(),
         file=DiffController.DIFF_FILE_NAME_FORMAT.format(suffix=suffix,
                                                          ts=ts))
Example #12
 def storetiebaquery(self,
                     query,
                     queryurl,
                     machineflaglist=MACHINEFLAGLIST_TIEBA):
     # Check whether the query already exists; if so, refresh its updatetime.
     # If not, pick the machine storing the fewest queries and store the query there.
     query = query.strip()
     queryurl = queryurl.strip()
     result = QueryStorage.find(query,
                                machineflaglist,
                                table=SQLDAO.SPIDER_TABLE_QUERYS_TIEBA)
     if result:
         resultdict = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
                                         result)
         machine = resultdict[SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG]
         id = QueryStorage.getid(query, machine)
         SQLDAO.getinstance().update(
             SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
             {SQLDAO.SPIDER_TABLE_QUERYS_ID: id}, {
                 SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
                 SpiderConfigure.getinstance().starttime(),
                 SQLDAO.SPIDER_TABLE_QUERYS_VALID:
                 1
             })
     else:
         machine = min(self.querystorage_tieba.iteritems(),
                       key=lambda x: x[1])[0]
         data = {
             SQLDAO.SPIDER_TABLE_QUERYS_ID:
             QueryStorage.getid(query, machine),
             SQLDAO.SPIDER_TABLE_QUERYS_QUERY:
             query,
             SQLDAO.SPIDER_TABLE_QUERYS_CREATEDATE:
             SpiderConfigure.getinstance().starttime(),
             SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
             SpiderConfigure.getinstance().starttime(),
             SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG:
             machine,
             SQLDAO.SPIDER_TABLE_QUERYS_QUERYURL:
             queryurl,
             SQLDAO.SPIDER_TABLE_QUERYS_VALID:
             1
         }
         SQLDAO.getinstance().insert(
             SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
             SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
             SQLDAO.getvaluesfromkeys(data,
                                      SQLDAO.SPIDER_TABLE_QUERYS_KEYS))
     # Keep a running per-machine count of stored queries
     self.querystorage_tieba[machine] = self.querystorage_tieba.get(
         machine, 0) + 1
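For a new query, the method above assigns whichever machine currently stores the fewest queries and then bumps that machine's counter, which balances queries across machines over time. The selection step reduces to a min over the counter dict:

def assign_machine(counts):
    # counts maps machine flag -> number of queries stored there so far
    machine = min(counts.items(), key=lambda kv: kv[1])[0]
    counts[machine] = counts.get(machine, 0) + 1
    return machine

counts = {'m1': 3, 'm2': 1, 'm3': 2}
assert assign_machine(counts) == 'm2'  # m2 held the fewest and now stands at 2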
Example #13
 def __init__(self):
     # download platforms
     SQLDAO.getinstance()
     self.downloader = Downloader()
     self.wdownloader = WDownloader()
     # ETL controller
     self.etl = ETLController()
     self.waitingperiod = int(
         SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                   const.SPIDER_WAITING_PERIOD))
     self.timeout = int(2 * int(
         SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                   const.SPIDER_WAIT_PLATFORM_TIMEOUT)))
     self.spiderstarttime = int(time.time())
     self.waibutimeout = 2 * 60 * 60
Example #14
 def process(self, params):
     # S2 Query Process
     if SPIDER_CHANNEL_S2 == SpiderConfigure.getinstance().getchannel():
         if SPIDER_S2_WEBSITE_TYPE not in params.customized:
             return True
     xparser = XPathUtility(params.content)
     maxitems = 0
     pageinfo = PageBasicInfo()
     template = None
     for template in TemplateManager.getxpaths(params.url):
         Logger.getlogging().debug('URL_TEMPLATE {url}\t{template}'.format(
             url=params.url,
             template=template[TemplateManager.XPATH_KEY_URL_TEMPLATE]))
         pageinfo, items = self.parsefromcontent(params, template, xparser)
         if constant.SPIDER_S2_WEBSITE_TYPE in params.customized:
             pageinfo.type = params.customized[
                 constant.SPIDER_S2_WEBSITE_TYPE]
     #if not params.page_title and not pageinfo.title and not params.lastretry:
     #return False
     if template is None:
         Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
     # value overrides
     pageinfo.url = params.url
     if not pageinfo.title:
         pageinfo.title = params.page_title
     if not pageinfo.body:
         pageinfo.body = params.page_body
     if not pageinfo.pubtime:
         pageinfo.pubtime = params.html_time
     NewsStorage.seturlinfos(pageinfo)
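The value-override block at the end fills any field the template parse left empty from the page-level values already on hand (title, body, fetch time). As a generic helper the same idea is just (names hypothetical):

def fill_missing(info, fallbacks):
    # fallbacks maps field name -> value to use when the parsed field is empty
    for field, value in fallbacks.items():
        if not getattr(info, field, None):
            setattr(info, field, value)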
Example #15
 def initwaibu(self):
     for dl in SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             const.SPIDER_TENCENT_PLATFORM_WBTASK_LIST).split(','):
         info = WaibiDownloaderInfo()
         info.taskname = SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             dl + constant.SPIDER_WBTASK_NAME)
         info.token = SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             dl + constant.SPIDER_WBTASK_TOKEN)
         info.appid = SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             dl + constant.SPIDER_WBTASK_APPID)
         self.wbimpls[WaibiDownloader(info)] = ''
         self.tasknamelist[info.taskname] = ''
Example #16
 def s2query(self):
     self.conf.setchannel(SPIDER_CHANNEL_S2)
     s2file = SpiderConfigure.getinstance().gets2file()
     file = FileUtility.getfilename(s2file)
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
     if FileUtility.exists(s2temppath):
         with open(s2temppath, 'r') as fp:
             querylist = []
             firstline = True
             for strquery in fp.readlines():
                 if firstline:
                     firstline = False
                     if strquery[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                         strquery = strquery[3:]
                 strquery = Common.strip(strquery)
                 if not strquery:
                     continue
                 Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                 self.conf.setquery(strquery)
                 URLStorage.updaterecycle()
                 querylist.append(strquery)
                 for site in self.factory.getall():
                     site.s2query(strquery.replace('&', ' '))
             sitelist = []
             for site in self.factory.getall():
                 if site.exists2():
                     sitelist.append(site)
             SpiderReport.loadquery(querylist)
             SpiderReport.loadsites(sitelist)
Example #17
 def initwebkit(self):
     for task in SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             const.SPIDER_TENCENT_PLATFORM_WEBKIT_TASK_LIST).split(','):
         taskinfo = TaskInfo()
         task = task.strip()
         taskinfo.taskid = SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             task + constant.SPIDER_TASKID)
         taskinfo.taskname = SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             task + constant.SPIDER_TASKNAME)
         taskinfo.userid = SpiderConfigure.getconfig(
             const.SPIDER_TENCENT_PLATFORM_DOMAIN,
             task + constant.SPIDER_USERID)
         self.wimpls.append(TencentDownloader(taskinfo))
Example #18
 def storecmt(url, content, pubdate, user):
     content = Common.strfilter(content)
     user = Common.strfilter(user)
     pubdate = TimeUtility.getuniformtime(pubdate)
     if not CMTStorage.exist(url, content, pubdate, user):
         Logger.getlogging().debug(
             'url:{url}, content:{content}, pubdate:{pubdate}, user:{user}'.
             format(url=url, content=content, pubdate=pubdate, user=user))
         id = CMTStorage.getid(url, content, pubdate, user)
         data = {
             SQLDAO.SPIDER_TABLE_COMMENTS_ID:
             id,
             SQLDAO.SPIDER_TABLE_COMMENTS_URL:
             url,
             SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE:
             pubdate,
             SQLDAO.SPIDER_TABLE_COMMENTS_USER:
             user,
             SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT:
             content,
             SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE:
             SpiderConfigure.getinstance().starttime()
         }
         SQLDAO.getinstance().insert(
             SQLDAO.SPIDER_TABLE_COMMENTS,
             SQLDAO.SPIDER_TABLE_COMMENTS_KEYS,
             SQLDAO.getvaluesfromkeys(data,
                                      SQLDAO.SPIDER_TABLE_COMMENTS_KEYS))
Example #19
 def show():
     u'{channel}\t{query}\t{cmtnum}\t{clicknum}\t{fansnum}\t{votenum}\t{publishdate}\t{createdate}\t{url}'
     Logger.getlogging().debug(
         'Now, Results Extract From Database Showing: ')
     Logger.getlogging().debug(
         u'channel\tquery\tcmtnum\tclicknum\tfansnum\tvotenum\tpublishdate\tcreatedate\turl'
     )
     alldata = SQLDAO.getinstance().find(
         SQLDAO.SPIDER_TABLE_NEWS, {
             SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE:
             SpiderConfigure.getinstance().starttime()
         })
     for data in alldata:
         dictdata = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, data)
         string = NewsStorage.NEWS_FORMAT.format(
             channel=dictdata[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL],
             query=dictdata[SQLDAO.SPIDER_TABLE_NEWS_QUERY],
             cmtnum=dictdata[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM],
             clicknum=dictdata[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM],
             fansnum=dictdata[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM],
             votenum=dictdata[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM],
             publishdate=dictdata[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE],
             createdate=dictdata[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE],
             url=dictdata[SQLDAO.SPIDER_TABLE_NEWS_URL])
         Logger.getlogging().debug(string)
Example #20
 def getfilename(self, url):
     # channel
     self.channel = SpiderConfigure.getinstance().getchannel()
     # S2 query info
     self.query = SpiderConfigure.getinstance().getquery()
     # S2 page type
     self.type = SpiderConfigure.getinstance().gettype()
     if self.channel == SPIDER_CHANNEL_S2:
         q = Common.md5(self.query)
     else:
         q = self.query
     return Storage.SPIDER_STORE_FILENAME_FORMAT.format(
         path=self.cache_path,
         date=TimeUtility.getcurrentdate(),
         channel=self.channel,
         query=q,
         filename=Common.md5(url))
Example #21
 def createdatabase(self):
     self.createcollection(self.SPIDER_COLLECTION_NEWS,
                           self.SPIDER_COLLECTION_NEWS_INDEX)
     self.createcollection(self.SPIDER_COLLECTION_CHANNEL,
                           self.SPIDER_COLLECTION_CHANNEL_INDEX)
     self.createcollection(self.SPIDER_COLLECTION_WEBSITE,
                           self.SPIDER_COLLECTION_WEBSITE_INDEX)
     self.createcollection(self.SPIDER_COLLECTION_COMMENTS,
                           self.SPIDER_COLLECTION_COMMENTS_INDEX)
     self.createcollection(self.SPIDER_COLLECTION_IMAGE,
                           self.SPIDER_COLLECTION_IMAGE_ID_INDEX)
     jsonfile = SpiderConfigure.getconfig(
         const.SPIDER_DATABASE_DOMAIN, const.SPIDER_DATABASE_CHANNEL_CONFIG)
     self.loadfile(self.SPIDER_COLLECTION_CHANNEL, jsonfile)
     jsonfile = SpiderConfigure.getconfig(
         const.SPIDER_DATABASE_DOMAIN, const.SPIDER_DATABASE_WEBSITE_CONFIG)
     self.loadfile(self.SPIDER_COLLECTION_WEBSITE, jsonfile)
Example #22
 def __init__(self):
     self.url_beforenewsinfo_map = {SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: {},
                                    SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: {},
                                    SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: {},
                                    SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: {}}
     self.url_beforenewsnum_map = {}
     self.url_curcmtcontent_map = {}
     self.url_curcmtnum_map = {}
     self.url_beforecmtnum_map = {}
     date = TimeUtility.getcurrentdate()
     path = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,const.SPIDER_OUTPUT_PATH), date)     
     suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,const.SPIDER_OUTPUT_FILENAME_SUFFIX)  
     self.outputpath = FileFormat.OUTPUTPATH.format(path=path, suffix=suffix, date=date.replace('-', '_'), ts=int(time.time()))
     self.errorinfopath = FileFormat.ERRORINFOPATH.format(path=path, suffix=suffix, date=date.replace('-', '_'), ts=int(time.time()))
     self.pushpath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,const.SPIDER_PUSH_PATH_MASTER), date)
     if not FileUtility.exists(path):
         FileUtility.mkdirs(path)    
Example #23
 def initlocal(self):
     """"""
     for dl in SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN,
             const.SPIDER_LOCAL_DOWNLOADER_LIST).split(','):
         info = LocalDownloaderInfo()
         dl = dl.strip()
         info.ip = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                             dl + constant.DOWNLOADER_IP)
         info.port = int(
             SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                       dl + constant.DOWNLOADER_PORT))
         info.username = SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN, dl + constant.DOWNLOADER_USERNAME)
         info.password = SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN, dl + constant.DOWNLOADER_PASSWORD)
         info.urlpath = SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN, dl + constant.DOWNLOADER_URL_PATH)
         info.donepath = SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN, dl + constant.DOWNLOADER_DONE_PATH)
         info.localdonepath = Storage.getstoragelocation(
             const.SPIDER_DONE_TEMP_PATH)
         info.jsonpath = Storage.getstoragelocation(
             const.SPIDER_JSON_TEMP_PATH)
         self.limpls.append(LocalDownloader(info))
Example #24
 def localetl(self):
     s1file = SpiderConfigure.getinstance().gets1file()
     self.etl.s1upload(s1file)
     s2file = self.etl.getqueryfromdb()
     self.etl.s2upload(s2file)
     s3file = self.etl.gettiebaqueryfromdb()
     self.etl.s3upload(s3file)
     self.upload()
     self.loop()
Example #25
 def getqueryfromdb(self):
     # Output file path for s2 queries
     s2file = SpiderConfigure.getinstance().gets2file()
     temppath = Storage.getstoragelocation(
         const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
     QueryStorage.getinstance().getlocalquerys(
         temppath, ETLController.LOCALMACHINEFLAG)
     if FileUtility.exists(temppath):
         return temppath
Example #26
 def generateurlfilepath(self, retrytimes=0):
     context = URLFileContext()
     context.channel = SpiderConfigure.getinstance().getchannel()
     context.query = SpiderConfigure.getinstance().getquery()
     context.retry = retrytimes
     # Avoid generating a URL file with the same name: wait 1s and take a fresh timestamp
     if self.urlfiletimestamp == int(time.time()):
         time.sleep(1)
     self.urlfiletimestamp = int(time.time())
     self.urlsfile = URLFileManager.URLS_FILE_PATTERN.format(
         path=self.tempurldir,
         channel=context.channel,
         query=Common.md5(context.query),
         ts=self.urlfiletimestamp)
     context.filename = self.urlsfile
     self.urlsfilemap[FileUtility.getfilename(self.urlsfile)] = context
     Logger.getlogging().info(self.urlsfile)
     return self.urlsfile
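Because the generated filename ends in a whole-second timestamp, two calls within the same second would collide; the method sleeps for one second whenever the cached timestamp matches the current one. The guard on its own:

import time

class UniqueSecondTimestamp(object):
    def __init__(self):
        self.last_ts = 0

    def next(self):
        # never hand out the same whole-second timestamp twice
        if self.last_ts == int(time.time()):
            time.sleep(1)
        self.last_ts = int(time.time())
        return self.last_ts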
Example #27
 def upload(self):
     upfiles = FileUtility.getfilelist(
         SpiderConfigure.getconfig(const.SPIDER_SCHEDULER_DOMAIN,
                                   const.SCHEDULER_URL_PATH), [])
     donefiles = [
         dfile for dfile in upfiles
         if dfile.endswith(constant.POST_FILE_SUFFIX)
     ]
     return self.downloader.upload(donefiles)
Example #28
 def removecachefile():
     cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_TEMPLATE_WORK_DIRECTORY)
     databackupfolder = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                    const.SPIDER_DATA_BACKUP_PATH) + TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
     if FileUtility.exists(cache):
         FileUtility.move(cache, databackupfolder)
         FileUtility.rmdir(cache)
     limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH_LIMIT))
     databackuppath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_DATA_BACKUP_PATH)
     if FileUtility.exists(databackuppath):
         validdate = TimeUtility.getdatebefore(limit, '%Y%m%d000000')
         for s in os.listdir(databackuppath):
             fullpath = os.path.join(databackuppath, s)
             #Logger.getlogging().info('remove cach folder ' + fullpath)
             #FileUtility.rmdir(fullpath)
             if s < validdate:
                 fullpath = os.path.join(databackuppath, s)
                 Logger.getlogging().info('remove cache folder ' + fullpath)
                 FileUtility.rmdir(fullpath)
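The pruning step works because the backup folders are named with a sortable %Y%m%d... timestamp, so comparing folder names against a cutoff string is equivalent to comparing dates. A standalone version (the folder layout is hypothetical):

import os
import shutil
from datetime import datetime, timedelta

def prune_backups(backup_root, keep_days):
    # names like '20240131000000' sort lexicographically in time order
    cutoff = (datetime.now() - timedelta(days=keep_days)).strftime('%Y%m%d000000')
    for name in os.listdir(backup_root):
        full = os.path.join(backup_root, name)
        if os.path.isdir(full) and name < cutoff:
            shutil.rmtree(full)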
Example #29
 def gettiebaqueryfromdb(self):
     # Output file path for tieba (s3) queries
     tiebafile = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                           const.SPIDER_S3_INPUT_FILE)
     temppath = Storage.getstoragelocation(
         const.SPIDER_TIEBA_TEMP_PATH) + FileUtility.getfilename(tiebafile)
     QueryStorage.getinstance().getlocalquerys_tieba(
         temppath, ETLController.LOCALMACHINEFLAG)
     if FileUtility.exists(temppath):
         return temppath
Example #30
 def __init__(self):
     """
     # @functions:__init__
     # @param: none
     # @return:none
     # @note: constructor of the MongoDAO class; initializes internal variables
     """
     self.ip = SpiderConfigure.getconfig(const.SPIDER_DATABASE_DOMAIN,
                                         const.SPIDER_DATABASE_IP)
     self.port = int(
         SpiderConfigure.getconfig(const.SPIDER_DATABASE_DOMAIN,
                                   const.SPIDER_DATABASE_PORT))
     self.database = SpiderConfigure.getconfig(
         const.SPIDER_DATABASE_DOMAIN, const.SPIDER_DATABASE_DATABASE)
     self.connected = False
     self.client = None
     self.retrytime = 0
     self.checktime = MongoDAO.gettime()
     self.createdatabase()