def __init__(self):
    # Use this URL to identify the class that returns S2 query results; the site's main URL is recommended
    self.fakeoriginalurl = 'http://query.website.com/'
    self.querylastdays = int(SpiderConfigure.getinstance().getlastdays())
    self.website = self
    self.maxpages = int(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                  const.SPIDER_S2_MAX_QUERY_PAGES))
def __init__(self, taskinfo=None, download_path=None):
    self.taskinfo = taskinfo
    self.maxfilenum = 100
    self.cache_path = Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH)
    path = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                     const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
    if download_path:
        self.download_path = download_path
    else:
        self.download_path = PUCDownloader.DOWNLOAD_PATH.format(path=path, taskid=self.taskinfo.taskid)
    self.parse_tool = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                                const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
    #self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
    self.pucbackpath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                 const.SPIDER_PUC_BACKUP_PATH) + self.taskinfo.taskid
    self.pucbacktoday = os.path.join(self.pucbackpath, TimeUtility.getcurrentdate())
    if not FileUtility.exists(self.pucbackpath):
        FileUtility.mkdirs(self.pucbackpath)
    if not FileUtility.exists(self.pucbacktoday):
        FileUtility.mkdirs(self.pucbacktoday)
    self.done_file = self.pucbacktoday + '/done/'
    self.json_path = self.pucbacktoday + '/json/'
    if not FileUtility.exists(self.done_file):
        FileUtility.mkdirs(self.done_file)
    if not FileUtility.exists(self.json_path):
        FileUtility.mkdirs(self.json_path)
    self.pucsavedays = 0
    self.clear()
def readFile(urlpath, filename):
    whoami = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN, const.SPIDER_POST_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_POST_DOMAIN,
                                         whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    writeTmpfile = donepath + filename + '.tmp'
    now = str(time.time()).split('.')[0]
    writefile = donepath + filename + '.txt.' + now + '.done'
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    Logger.getlogging().debug('post_done start:{f}'.format(f=writefile))
    with open(urlpath, 'r') as fp:
        lines = fp.readlines()
    os.mknod(writeTmpfile)
    for line in lines:
        jsonLine = json.loads(line)
        try:
            jsonStr = downPost(jsonLine)
            with open(writeTmpfile, 'a+') as filetemp:
                filetemp.write(jsonStr + '\n')
            Logger.getlogging().debug('{url}:Post request succeeded'.format(url=jsonLine['url']))
        except:
            Logger.getlogging().warning('{url}:Post request failed'.format(url=jsonLine['url']))
            Logger.printexception()
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
    Logger.getlogging().debug('post_done end:{f}'.format(f=writefile))
    FileUtility.remove(urlpath)
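# Illustrative sketch (not part of the original source): readFile consumes a
# JSON-lines file in which each line parses with json.loads and carries at
# least a 'url' key, since jsonLine['url'] is used for logging above. The
# path, file name and the extra 'data' field below are hypothetical.
import json

sample = [
    {'url': 'http://example.com/api/comments', 'data': 'page=1'},
    {'url': 'http://example.com/api/comments', 'data': 'page=2'},
]
with open('/tmp/post_urls.txt', 'w') as fp:
    for item in sample:
        fp.write(json.dumps(item) + '\n')

readFile('/tmp/post_urls.txt', 'post_urls')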
def __init__(self):
    self.factory = SiteFactory()
    self.conf = SpiderConfigure.getinstance()
    self.urlbackuppath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                   const.SPIDER_URL_BACKUP_PATH) + TimeUtility.getcurrentdate()
    self.period = int(SpiderConfigure.getinstance().getlastdays())
def copyfiles(self):
    # s1/s2 input paths
    s1file = SpiderConfigure.getinstance().gets1file()
    s2file = SpiderConfigure.getinstance().gets2file()
    # s1/s2 history paths
    self.conf.setchannel(SPIDER_CHANNEL_S1)
    s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
    if FileUtility.exists(s1file):
        lines = 0
        firstline = True
        with open(s1file, 'r') as fp:
            for line in fp.readlines():
                line = line.strip()
                if firstline:
                    firstline = False
                    if line[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                        line = line[3:]
                if line:
                    lines += 1
                    SpiderReport.puts1url(line)
        if lines > 0:
            FileUtility.copy(s1file, s1tempfile)
            SpiderReport.update(SPIDER_CHANNEL_S1, '', SpiderReport.URL_UPLOAD, lines)
    if FileUtility.exists(s2file):
        FileUtility.copy(s2file, s2temppath)
def download(urlfilepath):
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                         whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    filename = os.path.basename(urlfilepath)
    writeTmpfile = os.path.join(donepath, filename + '.temp')
    writefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        readlines = FileUtility.readlines(urlfilepath)
        for line in readlines:
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # Create an empty temp file
    with open(writeTmpfile, 'a+') as filetemp:
        filetemp.write('')
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, writeTmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, writeTmpfile)
    else:
        downGet(urlfilepath, writeTmpfile)
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=writefile))
    FileUtility.remove(urlfilepath)
def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                             whoami + constant.DOWNLOADER_URL_PATH)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                         whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.removefiles(donepath)
    backupPath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                        const.DOWNLOADER_URL_BACKUP),
                              TimeUtility.getcurrentdate())
    interval = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_INTERVAL)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath)
    while True:
        Logger.getlogging().debug('scanning')
        flag = False
        for filename in os.listdir(scanningPath):
            try:
                urlfilepath = os.path.join(scanningPath, filename)
                backupfile = os.path.join(backupPath, filename)
                if os.path.isfile(urlfilepath) and 'tmp' not in filename:
                    Logger.getlogging().info('Get url file:{file}'.format(file=filename))
                    FileUtility.copy(urlfilepath, backupfile)
                    download(urlfilepath)
                    if not flag:
                        flag = True
            except:
                Logger.printexception()
        if not flag:
            Logger.getlogging().debug('scanning interval sleeping {interval}s'.format(interval=interval))
            time.sleep(int(interval))
def __init__(self):
    self.id = ''
    # query
    self.query = SpiderConfigure.getinstance().getquery()
    # Channel
    self.channel = SpiderConfigure.getinstance().getchannel()
    # Type
    self.type = ''
    # URL
    self.url = ''
    # Title
    self.title = ''
    # Body / original post
    self.body = ''
    # Comments (content) / replies (content)
    # Number of comments
    self.cmtnum = -1
    # Read count / play count (increment)
    self.clicknum = -1
    # Number of up-votes
    self.votenum = -1
    # Number of fans / subscribers
    self.fansnum = -1
    # Publish time
    self.pubtime = TimeUtility.getintformtime(0)
    # createtime
    self.createtime = SpiderConfigure.getinstance().starttime()
def getid(url):
    idformat = '{machine}_{query}_{url}_{starttime}'
    id = idformat.format(machine=NewsStorage.LOCALMACHINEFLAG,
                         query=Common.urlenc(SpiderConfigure.getinstance().getquery()),
                         url=Common.urlenc(url),
                         starttime=SpiderConfigure.getinstance().starttime())
    return Common.md5(id)
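# Illustrative sketch (not part of the original source): the record id is the
# MD5 of "{machine}_{query}_{url}_{starttime}", so the same URL crawled with
# the same query in the same run always maps to the same id. The values below
# are hypothetical stand-ins for NewsStorage.LOCALMACHINEFLAG, the configured
# query and the spider start time, and Common.urlenc/Common.md5 are assumed to
# be ordinary URL-encoding and MD5 helpers.
import hashlib
import urllib

machine = 'm01'                                      # hypothetical machine flag
query = urllib.quote('test query')                   # URL-encoded query
url = urllib.quote('http://example.com/news/1')      # URL-encoded page URL
starttime = '20240101000000'                         # hypothetical start time

plain_id = '{machine}_{query}_{url}_{starttime}'.format(
    machine=machine, query=query, url=url, starttime=starttime)
record_id = hashlib.md5(plain_id).hexdigest()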
def getinstance():
    channel = SpiderConfigure.getinstance().getchannel()
    if channel == constant.SPIDER_CHANNEL_S2:
        key = SpiderConfigure().getquery()
    else:
        key = channel
    if key not in URLManager.__instancemap:
        URLManager.__instancemap[key] = URLManager()
    return URLManager.__instancemap[key]
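# Illustrative sketch (not part of the original source): getinstance keeps one
# URLManager per key, where the key is the current query on the S2 channel and
# the channel name otherwise, so repeated lookups in the same context reuse
# the same cached object. This assumes getinstance is exposed as a static
# method of URLManager, as the references to URLManager.__instancemap suggest.
manager_a = URLManager.getinstance()
manager_b = URLManager.getinstance()
assert manager_a is manager_b  # same channel/query -> same cached instance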
def __init__(self):
    self.database = SpiderDao()
    suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                       const.SPIDER_OUTPUT_FILENAME_SUFFIX)
    ts = TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
    self.difffile = '{path}/{dt}/{file}'.format(
        path=SpiderConfigure.getinstance().getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                     const.SPIDER_OUTPUT_PATH),
        dt=TimeUtility.getcurrentdate(),
        file=DiffController.DIFF_FILE_NAME_FORMAT.format(suffix=suffix, ts=ts))
def storetiebaquery(self, query, queryurl, machineflaglist=MACHINEFLAGLIST_TIEBA):
    # If the query already exists, refresh its update time;
    # otherwise store it on the machine that currently holds the fewest queries.
    query = query.strip()
    queryurl = queryurl.strip()
    result = QueryStorage.find(query, machineflaglist, table=SQLDAO.SPIDER_TABLE_QUERYS_TIEBA)
    if result:
        resultdict = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_QUERYS_KEYS, result)
        machine = resultdict[SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG]
        id = QueryStorage.getid(query, machine)
        SQLDAO.getinstance().update(
            SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
            {SQLDAO.SPIDER_TABLE_QUERYS_ID: id},
            {SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE: SpiderConfigure.getinstance().starttime(),
             SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1})
    else:
        machine = min(self.querystorage_tieba.iteritems(), key=lambda x: x[1])[0]
        data = {
            SQLDAO.SPIDER_TABLE_QUERYS_ID: QueryStorage.getid(query, machine),
            SQLDAO.SPIDER_TABLE_QUERYS_QUERY: query,
            SQLDAO.SPIDER_TABLE_QUERYS_CREATEDATE: SpiderConfigure.getinstance().starttime(),
            SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE: SpiderConfigure.getinstance().starttime(),
            SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: machine,
            SQLDAO.SPIDER_TABLE_QUERYS_QUERYURL: queryurl,
            SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1
        }
        SQLDAO.getinstance().insert(
            SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
            SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
            SQLDAO.getvaluesfromkeys(data, SQLDAO.SPIDER_TABLE_QUERYS_KEYS))
    # Keep a running per-machine count of stored queries.
    self.querystorage_tieba[machine] = self.querystorage_tieba.get(machine, 0) + 1
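# Illustrative sketch (not part of the original source): the least-loaded
# machine is picked by taking the dict entry with the smallest count, the
# same min(..., key=...) pattern used in storetiebaquery above. The machine
# names and counts below are hypothetical.
counts = {'m01': 12, 'm02': 7, 'm03': 9}
least_loaded = min(counts.iteritems(), key=lambda x: x[1])[0]  # -> 'm02'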
def __init__(self):
    # Download platforms
    SQLDAO.getinstance()
    self.downloader = Downloader()
    self.wdownloader = WDownloader()
    # ETL controller
    self.etl = ETLController()
    self.waitingperiod = int(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                       const.SPIDER_WAITING_PERIOD))
    self.timeout = int(2 * int(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                         const.SPIDER_WAIT_PLATFORM_TIMEOUT)))
    self.spiderstarttime = int(time.time())
    self.waibutimeout = 2 * 60 * 60
def process(self, params):
    # S2 Query Process
    if SPIDER_CHANNEL_S2 == SpiderConfigure.getinstance().getchannel():
        if SPIDER_S2_WEBSITE_TYPE not in params.customized:
            return True
    xparser = XPathUtility(params.content)
    maxitems = 0
    pageinfo = PageBasicInfo()
    template = None
    for template in TemplateManager.getxpaths(params.url):
        Logger.getlogging().debug('URL_TEMPLATE {url}\t{template}'.format(
            url=params.url,
            template=template[TemplateManager.XPATH_KEY_URL_TEMPLATE]))
        pageinfo, items = self.parsefromcontent(params, template, xparser)
    if constant.SPIDER_S2_WEBSITE_TYPE in params.customized:
        pageinfo.type = params.customized[constant.SPIDER_S2_WEBSITE_TYPE]
    #if not params.page_title and not pageinfo.title and not params.lastretry:
    #    return False
    if template is None:
        Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
    # Value overrides: fall back to page-level fields when the template gave nothing
    pageinfo.url = params.url
    if not pageinfo.title:
        pageinfo.title = params.page_title
    if not pageinfo.body:
        pageinfo.body = params.page_body
    if not pageinfo.pubtime:
        pageinfo.pubtime = params.html_time
    NewsStorage.seturlinfos(pageinfo)
def initwaibu(self):
    for dl in SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                        const.SPIDER_TENCENT_PLATFORM_WBTASK_LIST).split(','):
        info = WaibiDownloaderInfo()
        info.taskname = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                                  dl + constant.SPIDER_WBTASK_NAME)
        info.token = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                               dl + constant.SPIDER_WBTASK_TOKEN)
        info.appid = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                               dl + constant.SPIDER_WBTASK_APPID)
        self.wbimpls[WaibiDownloader(info)] = ''
        self.tasknamelist[info.taskname] = ''
def s2query(self):
    self.conf.setchannel(SPIDER_CHANNEL_S2)
    s2file = SpiderConfigure.getinstance().gets2file()
    file = FileUtility.getfilename(s2file)
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
    if FileUtility.exists(s2temppath):
        with open(s2temppath, 'r') as fp:
            querylist = []
            firstline = True
            for strquery in fp.readlines():
                if firstline:
                    firstline = False
                    if strquery[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                        strquery = strquery[3:]
                strquery = Common.strip(strquery)
                if not strquery:
                    continue
                Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                self.conf.setquery(strquery)
                URLStorage.updaterecycle()
                querylist.append(strquery)
                for site in self.factory.getall():
                    site.s2query(strquery.replace('&', ' '))
            sitelist = []
            for site in self.factory.getall():
                if site.exists2():
                    sitelist.append(site)
            SpiderReport.loadquery(querylist)
            SpiderReport.loadsites(sitelist)
def initwebkit(self):
    for task in SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                          const.SPIDER_TENCENT_PLATFORM_WEBKIT_TASK_LIST).split(','):
        taskinfo = TaskInfo()
        task = task.strip()
        taskinfo.taskid = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                                    task + constant.SPIDER_TASKID)
        taskinfo.taskname = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                                      task + constant.SPIDER_TASKNAME)
        taskinfo.userid = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                                    task + constant.SPIDER_USERID)
        self.wimpls.append(TencentDownloader(taskinfo))
def storecmt(url, content, pubdate, user):
    content = Common.strfilter(content)
    user = Common.strfilter(user)
    pubdate = TimeUtility.getuniformtime(pubdate)
    if not CMTStorage.exist(url, content, pubdate, user):
        Logger.getlogging().debug(
            'url:{url}, content:{content}, pubdate:{pubdate}, user:{user}'.format(
                url=url, content=content, pubdate=pubdate, user=user))
        id = CMTStorage.getid(url, content, pubdate, user)
        data = {
            SQLDAO.SPIDER_TABLE_COMMENTS_ID: id,
            SQLDAO.SPIDER_TABLE_COMMENTS_URL: url,
            SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE: pubdate,
            SQLDAO.SPIDER_TABLE_COMMENTS_USER: user,
            SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT: content,
            SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE: SpiderConfigure.getinstance().starttime()
        }
        SQLDAO.getinstance().insert(
            SQLDAO.SPIDER_TABLE_COMMENTS,
            SQLDAO.SPIDER_TABLE_COMMENTS_KEYS,
            SQLDAO.getvaluesfromkeys(data, SQLDAO.SPIDER_TABLE_COMMENTS_KEYS))
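# Illustrative sketch (not part of the original source): a site parser would
# call storecmt once per extracted comment. The URL, text, timestamp and user
# name below are hypothetical, and storecmt is assumed to be exposed as
# CMTStorage.storecmt, as the CMTStorage.exist/getid calls inside it suggest.
# Because of the exist() check above, repeating the same call inserts only one row.
CMTStorage.storecmt('http://example.com/news/1',
                    'first comment text',
                    '2024-01-01 12:00:00',
                    'some_user')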
def show():
    u'{channel}\t{query}\t{cmtnum}\t{clicknum}\t{fansnum}\t{votenum}\t{publishdate}\t{createdate}\t{url}'
    Logger.getlogging().debug('Now, Results Extract From Database Showing: ')
    Logger.getlogging().debug(
        u'channel\tquery\tcmtnum\tclicknum\tfansnum\tvotenum\tpublishdate\tcreatedate\turl')
    alldata = SQLDAO.getinstance().find(
        SQLDAO.SPIDER_TABLE_NEWS,
        {SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: SpiderConfigure.getinstance().starttime()})
    for data in alldata:
        dictdata = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, data)
        string = NewsStorage.NEWS_FORMAT.format(
            channel=dictdata[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL],
            query=dictdata[SQLDAO.SPIDER_TABLE_NEWS_QUERY],
            cmtnum=dictdata[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM],
            clicknum=dictdata[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM],
            fansnum=dictdata[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM],
            votenum=dictdata[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM],
            publishdate=dictdata[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE],
            createdate=dictdata[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE],
            url=dictdata[SQLDAO.SPIDER_TABLE_NEWS_URL])
        Logger.getlogging().debug(string)
def getfilename(self, url):
    # Channel
    self.channel = SpiderConfigure.getinstance().getchannel()
    # S2 query
    self.query = SpiderConfigure.getinstance().getquery()
    # S2 page type
    self.type = SpiderConfigure.getinstance().gettype()
    if self.channel == SPIDER_CHANNEL_S2:
        q = Common.md5(self.query)
    else:
        q = self.query
    return Storage.SPIDER_STORE_FILENAME_FORMAT.format(
        path=self.cache_path,
        date=TimeUtility.getcurrentdate(),
        channel=self.channel,
        query=q,
        filename=Common.md5(url))
def createdatabase(self):
    self.createcollection(self.SPIDER_COLLECTION_NEWS, self.SPIDER_COLLECTION_NEWS_INDEX)
    self.createcollection(self.SPIDER_COLLECTION_CHANNEL, self.SPIDER_COLLECTION_CHANNEL_INDEX)
    self.createcollection(self.SPIDER_COLLECTION_WEBSITE, self.SPIDER_COLLECTION_WEBSITE_INDEX)
    self.createcollection(self.SPIDER_COLLECTION_COMMENTS, self.SPIDER_COLLECTION_COMMENTS_INDEX)
    self.createcollection(self.SPIDER_COLLECTION_IMAGE, self.SPIDER_COLLECTION_IMAGE_ID_INDEX)
    jsonfile = SpiderConfigure.getconfig(const.SPIDER_DATABASE_DOMAIN,
                                         const.SPIDER_DATABASE_CHANNEL_CONFIG)
    self.loadfile(self.SPIDER_COLLECTION_CHANNEL, jsonfile)
    jsonfile = SpiderConfigure.getconfig(const.SPIDER_DATABASE_DOMAIN,
                                         const.SPIDER_DATABASE_WEBSITE_CONFIG)
    self.loadfile(self.SPIDER_COLLECTION_WEBSITE, jsonfile)
def __init__(self):
    self.url_beforenewsinfo_map = {SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: {}}
    self.url_beforenewsnum_map = {}
    self.url_curcmtcontent_map = {}
    self.url_curcmtnum_map = {}
    self.url_beforecmtnum_map = {}
    date = TimeUtility.getcurrentdate()
    path = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH), date)
    suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_FILENAME_SUFFIX)
    self.outputpath = FileFormat.OUTPUTPATH.format(path=path,
                                                   suffix=suffix,
                                                   date=date.replace('-', '_'),
                                                   ts=int(time.time()))
    self.errorinfopath = FileFormat.ERRORINFOPATH.format(path=path,
                                                         suffix=suffix,
                                                         date=date.replace('-', '_'),
                                                         ts=int(time.time()))
    self.pushpath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_PUSH_PATH_MASTER), date)
    if not FileUtility.exists(path):
        FileUtility.mkdirs(path)
def initlocal(self):
    """"""
    for dl in SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                        const.SPIDER_LOCAL_DOWNLOADER_LIST).split(','):
        info = LocalDownloaderInfo()
        dl = dl.strip()
        info.ip = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                            dl + constant.DOWNLOADER_IP)
        info.port = int(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                  dl + constant.DOWNLOADER_PORT))
        info.username = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                  dl + constant.DOWNLOADER_USERNAME)
        info.password = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                  dl + constant.DOWNLOADER_PASSWORD)
        info.urlpath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                 dl + constant.DOWNLOADER_URL_PATH)
        info.donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                  dl + constant.DOWNLOADER_DONE_PATH)
        info.localdonepath = Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH)
        info.jsonpath = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
        self.limpls.append(LocalDownloader(info))
def localetl(self):
    s1file = SpiderConfigure.getinstance().gets1file()
    self.etl.s1upload(s1file)
    s2file = self.etl.getqueryfromdb()
    self.etl.s2upload(s2file)
    s3file = self.etl.gettiebaqueryfromdb()
    self.etl.s3upload(s3file)
    self.upload()
    self.loop()
def getqueryfromdb(self):
    # Output file path for S2 queries
    s2file = SpiderConfigure.getinstance().gets2file()
    temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
    QueryStorage.getinstance().getlocalquerys(temppath, ETLController.LOCALMACHINEFLAG)
    if FileUtility.exists(temppath):
        return temppath
def generateurlfilepath(self, retrytimes=0):
    context = URLFileContext()
    context.channel = SpiderConfigure.getinstance().getchannel()
    context.query = SpiderConfigure.getinstance().getquery()
    context.retry = retrytimes
    # Avoid generating an identical URL file name: wait one second, then take a fresh timestamp
    if self.urlfiletimestamp == int(time.time()):
        time.sleep(1)
    self.urlfiletimestamp = int(time.time())
    self.urlsfile = URLFileManager.URLS_FILE_PATTERN.format(
        path=self.tempurldir,
        channel=context.channel,
        query=Common.md5(context.query),
        ts=self.urlfiletimestamp)
    context.filename = self.urlsfile
    self.urlsfilemap[FileUtility.getfilename(self.urlsfile)] = context
    Logger.getlogging().info(self.urlsfile)
    return self.urlsfile
def upload(self):
    upfiles = FileUtility.getfilelist(
        SpiderConfigure.getconfig(const.SPIDER_SCHEDULER_DOMAIN, const.SCHEDULER_URL_PATH), [])
    donefiles = [dfile for dfile in upfiles if dfile.endswith(constant.POST_FILE_SUFFIX)]
    return self.downloader.upload(donefiles)
def removecachefile():
    cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                      const.SPIDER_TEMPLATE_WORK_DIRECTORY)
    databackupfolder = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                 const.SPIDER_DATA_BACKUP_PATH) + TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
    if FileUtility.exists(cache):
        FileUtility.move(cache, databackupfolder)
        FileUtility.rmdir(cache)
    limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                          const.SPIDER_OUTPUT_PATH_LIMIT))
    databackuppath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                               const.SPIDER_DATA_BACKUP_PATH)
    if FileUtility.exists(databackuppath):
        validdate = TimeUtility.getdatebefore(limit, '%Y%m%d000000')
        for s in os.listdir(databackuppath):
            fullpath = os.path.join(databackuppath, s)
            #Logger.getlogging().info('remove cache folder ' + fullpath)
            #FileUtility.rmdir(fullpath)
            if s < validdate:
                fullpath = os.path.join(databackuppath, s)
                Logger.getlogging().info('remove cache folder ' + fullpath)
                FileUtility.rmdir(fullpath)
def gettiebaqueryfromdb(self):
    # Output file path for S2 (Tieba) queries
    tiebafile = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_S3_INPUT_FILE)
    temppath = Storage.getstoragelocation(const.SPIDER_TIEBA_TEMP_PATH) + FileUtility.getfilename(tiebafile)
    QueryStorage.getinstance().getlocalquerys_tieba(temppath, ETLController.LOCALMACHINEFLAG)
    if FileUtility.exists(temppath):
        return temppath
def __init__(self): """ # @functions:__init__ # @param: none # @return:none # @note:mongodao类的构造器,初始化内部变量 """ self.ip = SpiderConfigure.getconfig(const.SPIDER_DATABASE_DOMAIN, const.SPIDER_DATABASE_IP) self.port = int( SpiderConfigure.getconfig(const.SPIDER_DATABASE_DOMAIN, const.SPIDER_DATABASE_PORT)) self.database = SpiderConfigure.getconfig( const.SPIDER_DATABASE_DOMAIN, const.SPIDER_DATABASE_DATABASE) self.connected = False self.client = None self.retrytime = 0 self.checktime = MongoDAO.gettime() self.createdatabase()