def initlocal(self):
    """Create a LocalDownloader for each downloader named in the local config."""
    for dl in SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                        const.SPIDER_LOCAL_DOWNLOADER_LIST).split(','):
        dl = dl.strip()
        info = LocalDownloaderInfo()
        info.ip = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                            dl + constant.DOWNLOADER_IP)
        info.port = int(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                  dl + constant.DOWNLOADER_PORT))
        info.username = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                  dl + constant.DOWNLOADER_USERNAME)
        info.password = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                  dl + constant.DOWNLOADER_PASSWORD)
        info.urlpath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                 dl + constant.DOWNLOADER_URL_PATH)
        info.donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                  dl + constant.DOWNLOADER_DONE_PATH)
        info.localdonepath = Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH)
        info.jsonpath = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
        self.limpls.append(LocalDownloader(info))
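# A minimal sketch of the config section initlocal() reads, assuming
# SpiderConfigure.getconfig(domain, key) resolves INI-style keys; the section
# name and key suffixes below are illustrative stand-ins for the real
# const/constant values, not the actual configuration:
#
#   [spider_local]
#   downloader_list = dl1, dl2
#   dl1.ip = 192.168.0.10
#   dl1.port = 22
#   dl1.username = spider
#   dl1.password = secret
#   dl1.urlpath = /data/urls
#   dl1.donepath = /data/done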
def copyfiles(self):
    # s1/s2 input paths
    s1file = SpiderConfigure.getinstance().gets1file()
    s2file = SpiderConfigure.getinstance().gets2file()
    # s1/s2 history paths
    self.conf.setchannel(SPIDER_CHANNEL_S1)
    s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
    if FileUtility.exists(s1file):
        lines = 0
        firstline = True
        with open(s1file, 'r') as fp:
            for line in fp.readlines():
                line = line.strip()
                if firstline:
                    firstline = False
                    # Strip a UTF-8 BOM from the first line if present.
                    if line[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                        line = line[3:]
                if line:
                    lines += 1
                    SpiderReport.puts1url(line)
        if lines > 0:
            FileUtility.copy(s1file, s1tempfile)
            SpiderReport.update(SPIDER_CHANNEL_S1, '', SpiderReport.URL_UPLOAD, lines)
    if FileUtility.exists(s2file):
        FileUtility.copy(s2file, s2temppath)
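# The first-line check above works because a UTF-8 file written with a BOM
# starts with the three bytes '\xef\xbb\xbf', and in Python 2 a str read in
# 'r' mode is a byte string, so line[:3] compares directly against
# codecs.BOM_UTF8. The same idea as a standalone sketch (strip_utf8_bom is a
# hypothetical helper, not part of this module):
import codecs

def strip_utf8_bom(line):
    # Drop a leading UTF-8 BOM from a byte string, if present.
    if line[:3] == codecs.BOM_UTF8:
        return line[3:]
    return line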
def s2query(self):
    self.conf.setchannel(SPIDER_CHANNEL_S2)
    s2file = SpiderConfigure.getinstance().gets2file()
    file = FileUtility.getfilename(s2file)
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
    if FileUtility.exists(s2temppath):
        with open(s2temppath, 'r') as fp:
            querylist = []
            firstline = True
            for strquery in fp.readlines():
                if firstline:
                    firstline = False
                    # Strip a UTF-8 BOM from the first line if present.
                    if strquery[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                        strquery = strquery[3:]
                strquery = Common.strip(strquery)
                if not strquery:
                    continue
                Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                self.conf.setquery(strquery)
                URLStorage.updaterecycle()
                querylist.append(strquery)
                for site in self.factory.getall():
                    site.s2query(strquery.replace('&', ' '))
        sitelist = []
        for site in self.factory.getall():
            if site.exists2():
                sitelist.append(site)
        SpiderReport.loadquery(querylist)
        SpiderReport.loadsites(sitelist)
def __init__(self, taskinfo=None, download_path=None):
    self.taskinfo = taskinfo
    self.maxfilenum = 100
    self.cache_path = Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH)
    path = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                     const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
    if download_path:
        self.download_path = download_path
    else:
        self.download_path = PUCDownloader.DOWNLOAD_PATH.format(path=path, taskid=self.taskinfo.taskid)
    self.parse_tool = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                                const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
    #self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
    self.pucbackpath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                 const.SPIDER_PUC_BACKUP_PATH) + self.taskinfo.taskid
    self.pucbacktoday = os.path.join(self.pucbackpath, TimeUtility.getcurrentdate())
    if not FileUtility.exists(self.pucbackpath):
        FileUtility.mkdirs(self.pucbackpath)
    if not FileUtility.exists(self.pucbacktoday):
        FileUtility.mkdirs(self.pucbacktoday)
    self.done_file = self.pucbacktoday + '/done/'
    self.json_path = self.pucbacktoday + '/json/'
    if not FileUtility.exists(self.done_file):
        FileUtility.mkdirs(self.done_file)
    if not FileUtility.exists(self.json_path):
        FileUtility.mkdirs(self.json_path)
    self.pucsavedays = 0
    self.clear()
def getqueryfromdb(self):
    # Write this machine's s2 queries to a temp file and return its path.
    s2file = SpiderConfigure.getinstance().gets2file()
    temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
    QueryStorage.getinstance().getlocalquerys(temppath, ETLController.LOCALMACHINEFLAG)
    if FileUtility.exists(temppath):
        return temppath
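# Call-site sketch (hypothetical names): the method returns None implicitly
# when QueryStorage produced no file, so callers should test the result.
#
#   temppath = controller.getqueryfromdb()
#   if temppath:
#       process_queries(temppath)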
def gettiebaqueryfromdb(self):
    # Write this machine's tieba queries to a temp file and return its path.
    tiebafile = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                          const.SPIDER_S3_INPUT_FILE)
    temppath = Storage.getstoragelocation(const.SPIDER_TIEBA_TEMP_PATH) + FileUtility.getfilename(tiebafile)
    QueryStorage.getinstance().getlocalquerys_tieba(temppath, ETLController.LOCALMACHINEFLAG)
    if FileUtility.exists(temppath):
        return temppath
def __init__(self, taskinfo):
    self.taskinfo = taskinfo
    self.upload_url = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                                const.SPIDER_TENCENT_PLATFORM_UPLOAD_URL)
    self.cache_path = Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH)
    path = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                     const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
    self.download_path = TencentDownloader.DOWNLOAD_PATH.format(path=path, taskid=self.taskinfo.taskid)
    self.parse_tool = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                                const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL)
    self.parse_tool_img = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                                    const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
    self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
    self.upload_file_list = {}
    self.recycle_times = 0
    self.download_file_list = []
    self.download_file_list2 = []
    self.retrytimes = int(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                    const.SPIDER_UPLOAD_RETRY_TIMES))
    # newly added fields
    self.uploadfile_retranslist = {}
    self.outtimelimit = int(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                      const.SPIDER_WAIT_PLATFORM_TIMEOUT))
    # self.outtimelimit = 10
    self.download_time = time.time()
    self.taskstatusflag = True
    self.start_time = 0
def __init__(self, info):
    self.token = info.token
    self.appid = info.appid
    self.taskname = info.taskname
    self.jobid = ''
    #self.times = time.strftime('%Y%m%d',time.localtime())
    ts = 'start={start}000000&end={end}235959'
    t1 = time.strftime('%Y%m%d', time.localtime(time.time() - 60 * 60 * 24 * 1))
    t2 = time.strftime('%Y%m%d', time.localtime(time.time() - 60 * 60 * 24 * 8))
    # appid 180 pulls the past week (eight days back through yesterday);
    # every other appid pulls yesterday only.
    if int(self.appid) == 180:
        self.times = ts.format(start=t2, end=t1)
    else:
        self.times = ts.format(start=t1, end=t1)
    self.download_path = Storage.getstoragelocation(const.SPIDER_WAIBU_TEMP_PATH)
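# Worked example of the window built above: if today is 20240310, then
# t1 = 20240309 (yesterday) and t2 = 20240302 (eight days back), so appid 180
# gets
#   start=20240302000000&end=20240309235959
# and every other appid gets
#   start=20240309000000&end=20240309235959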
def __init__(self):
    self.upload_file_list = {}
    self.impls = []
    self.implsindex = 0
    self.initcommon()
    self.wimpls = []
    self.wimplsindex = 0
    self.initwebkit()
    self.limpls = []
    self.limplsindex = 0
    self.initlocal()
    self.tempurlpath = Storage.getstoragelocation(const.SPIDER_URLS_TEMP_PATH)
    self.urlbackuppath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                   const.SPIDER_URL_BACKUP_PATH) + TimeUtility.getcurrentdate()
    # retry bookkeeping for files whose download failed
    self.retransmissionfiles = {}
    self.all_retransmissionfiles = {}
    self.retransmissionlimitnum = 3
    self.filetime = 0
def copyfiles(self):
    # s1/s2 input paths
    s1file = SpiderConfigure.getinstance().gets1file()
    s2file = SpiderConfigure.getinstance().gets2file()
    # s1/s2 history paths
    self.conf.setchannel(SPIDER_CHANNEL_S1)
    # s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
    if FileUtility.exists(s1file):
        lines = 0
        firstline = True
        with open(s1file, 'r') as fp:
            rows = []
            for line in fp.readlines():
                line = line.strip()
                if firstline:
                    firstline = False
                    # Strip a UTF-8 BOM from the first line if present.
                    if line[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                        line = line[3:]
                if line:
                    lines += 1
                    rows.append(line)
                    # Rotate to a new url file every SPIDER_S1_MAX_LINE_PER_FILE lines.
                    if lines % constant.SPIDER_S1_MAX_LINE_PER_FILE == 0:
                        s1tempfile = URLFileManager.generateurlfilepath() + constant.WEBKIT_FILE_SUFFIX
                        FileUtility.writelines(s1tempfile, rows)
                        rows = []
            if rows:
                s1tempfile = URLFileManager.generateurlfilepath() + constant.WEBKIT_FILE_SUFFIX
                FileUtility.writelines(s1tempfile, rows)
                rows = []
    if FileUtility.exists(s2file):
        FileUtility.copy(s2file, s2temppath)
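# The loop above flushes a fresh url file every
# constant.SPIDER_S1_MAX_LINE_PER_FILE lines, then writes the remainder.
# The same chunking pattern in isolation (write_chunk and chunk_size are
# illustrative stand-ins for FileUtility.writelines and the constant):
def write_in_chunks(lines, write_chunk, chunk_size=1000):
    rows = []
    for line in lines:
        rows.append(line)
        if len(rows) == chunk_size:
            write_chunk(rows)  # flush a full chunk
            rows = []
    if rows:
        write_chunk(rows)  # flush the remainder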
def dumpurls(self):
    # Dump the url list for this machine's queries and store it in the
    # corresponding file.
    s2file = SpiderConfigure.getinstance().gets2file()
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
    #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
    querys = QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
    for query in querys:
        Logger.getlogging().debug('Selecting urls to insert/update for the local url file upload...')
        self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
        self.conf.setquery(query)
        # Do not delete the commented-out code below.
        # 1. Convert the data within the period:
        #    1.1 if publishdate is set, keep rows from the last week
        #    1.2 if publishdate is 0, fall back to the create time, last week
        #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
        #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
        #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
        #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
        #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
        #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #starttime = SpiderConfigure.getinstance().starttime(),
        #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
        #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
        #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
        #secs =self.period * 24*60*60
        #)
        where = {SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                 SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG}
        Logger.getlogging().debug('Query condition: {where}'.format(where=str(where)))
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urltemplist = []
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
            createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate <= self.period * 24 * 60 * 60) or \
               (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24 * 60 * 60):
                if url not in urltemplist:
                    urltemplist.append(url)
                    params = PageBasicInfo()
                    params.url = url
                    NewsStorage.seturlinfos(params)
        # 2. Extract rows whose createdate equals this run's start time.
        URLFileManager.getinstance().generateurlfilepath()
        where = {SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                 SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG,
                 SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: SpiderConfigure.getinstance().starttime()}
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urllist = []
        linecount = 0
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            urllist.append(url)
            context = URLContext()
            context.originalurl = url
            context.type = URLContext.S1_MAIN_BODY
            context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[SQLDAO.SPIDER_TABLE_NEWS_TYPE]
            Logger.getlogging().debug(url)
            URLManager.getinstance().storeurl(url, context, REQUEST_TYPE_WEBKIT)
            linecount += 1
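# The first query above keeps a row when its publish date (or, if the publish
# date is unset, its create date) falls inside the last self.period days.
# The same predicate in isolation, assuming all times are epoch seconds
# (is_fresh is a hypothetical helper; the real code converts publishdate via
# TimeUtility.getinttime first):
def is_fresh(publishdate, createdate, now, period_days, unset=0):
    window = period_days * 24 * 60 * 60
    if publishdate == unset:
        return now - createdate <= window
    return now - publishdate <= window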
def upload(self):
    FileUtility.flush()
    upfiles = FileUtility.getfilelist(Storage.getstoragelocation(const.SPIDER_URLS_TEMP_PATH), [])
    return self.downloader.upload(upfiles)