def retrydownload(self, jsonfile, urlset):
    Logger.getlogging().warning('upload failed urls {num}'.format(num=len(urlset)))
    context = URLFileManager.getinstance().geturlfilecontext(FileUtility.getfilename(jsonfile))
    if context.retry >= 2:
        Logger.getlogging().error('do not upload for failed again')
        for key in urlset.keys():
            Logger.getlogging().error('download {url} failed'.format(url=key))
    else:
        urls = []
        for key in urlset.keys():
            Logger.getlogging().warning('retry download {url}'.format(url=key))
            for i in range(0, urlset[key]):
                urls.append(key)
        newurlfile = URLFileManager.getinstance().generateurlfilepath(context.retry + 1)
        Logger.getlogging().warning('Retry download URL {file}'.format(file=newurlfile))
        if constant.POST_FILE_SUFFIX in jsonfile:
            URLManager.getinstance().storeurls(urls, constant.REQUEST_TYPE_POST)
        elif constant.WEBKIT_FILE_SUFFIX in jsonfile:
            URLManager.getinstance().storeurls(urls, constant.REQUEST_TYPE_WEBKIT)
        else:
            URLManager.getinstance().storeurls(urls, constant.REQUEST_TYPE_COMMON)
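# Usage sketch (hypothetical values): `urlset` is assumed to map a failed URL to
# the number of downloads still expected for it, as built up in processfile()
# from backupfile(); retrydownload() re-stores each URL that many times so the
# next download round can pick them up again.
#
#   failed = {'http://example.com/page1': 2, 'http://example.com/page2': 1}
#   self.retrydownload('post_1234567890.json', failed)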
def preprocess(self, filepath):
    result = False
    context = URLFileManager.getinstance().geturlfilecontext(FileUtility.getfilename(filepath))
    if context:
        self.conf.setchannel(context.channel)
        if context.channel == SPIDER_CHANNEL_S2:
            self.conf.setquery(context.query)
        else:
            self.conf.setquery('')
        URLFileManager.getinstance().generateurlfilepath()
        result = True
    return result
def s2upload(self, sfile):
    if FileUtility.exists(sfile):
        lines = FileUtility.readlines(sfile)
        for line in lines:
            try:
                query = line.strip()
                self.conf.setchannel(SPIDER_CHANNEL_S2)
                self.conf.setquery(query)
                URLFileManager.getinstance().generateurlfilepath()
                allsite = self.factory.getall()
                for site in allsite:
                    site.s2query(query)
            except:
                Logger.printexception()
def renewfilename(self, file):
    """Replace the trailing timestamp in a URL file name with the current time and update its context."""
    filename = FileUtility.getfilename(file)
    context = URLFileManager.getinstance().geturlfilecontext(filename)
    if not context:
        return False
    if self.filetime == int(time.time()):
        time.sleep(1)
    self.filetime = int(time.time())
    newfilename = filename.replace(re.findall(r'\d+', filename)[-1], str(self.filetime))
    urlsfile = self.tempurlpath + newfilename
    context.filename = urlsfile
    URLFileManager.getinstance().updateurlfilecontext(FileUtility.getfilename(urlsfile), context)
    return urlsfile
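# Example (hypothetical file name): the trailing digit run is swapped for the
# current epoch second, so
#   self.renewfilename('/tmp/urls_1500000000.txt')
# returns roughly self.tempurlpath + 'urls_<now>.txt', assuming the last digit
# group in the name is the original timestamp.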
def s3upload(self, tiebafile):
    lines = FileUtility.readlines(tiebafile)
    querylist = []
    sitelist = []
    self.conf.setchannel(SPIDER_CHANNEL_S2)
    for strquery in lines:
        query = strquery.split('\t')[0].strip()
        url = strquery.split('\t')[1].strip()
        Logger.getlogging().debug(query)
        Logger.getlogging().debug(url)
        self.conf.setquery(query)
        URLFileManager.getinstance().generateurlfilepath()
        querylist.append(query)
        site = self.factory.getsite(url)
        site.s2query(url)
        if site not in sitelist:
            sitelist.append(site)
def s1upload(self, sfile):
    if FileUtility.exists(sfile):
        lines = FileUtility.readlines(sfile)
        self.conf.setchannel(SPIDER_CHANNEL_S1)
        self.conf.setquery('')
        URLFileManager.getinstance().generateurlfilepath()
        for line in lines:
            try:
                url = line.strip()
                params = PageBasicInfo()
                params.url = url
                # NewsStorage.seturlinfos(params)
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context, REQUEST_TYPE_WEBKIT)
            except:
                Logger.printexception()
def copyfiles(self):
    # s1/s2 input paths
    s1file = SpiderConfigure.getinstance().gets1file()
    s2file = SpiderConfigure.getinstance().gets2file()
    # s1/s2 history paths
    self.conf.setchannel(SPIDER_CHANNEL_S1)
    # s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
    if FileUtility.exists(s1file):
        lines = 0
        firstline = True
        with open(s1file, 'r') as fp:
            rows = []
            for line in fp.readlines():
                line = line.strip()
                if firstline:
                    firstline = False
                    if line[:3] == codecs.BOM_UTF8:
                        Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                        line = line[3:]
                if line:
                    lines += 1
                    rows.append(line)
                    if lines % constant.SPIDER_S1_MAX_LINE_PER_FILE == 0:
                        s1tempfile = URLFileManager.getinstance().generateurlfilepath() + constant.WEBKIT_FILE_SUFFIX
                        FileUtility.writelines(s1tempfile, rows)
                        rows = []
            if rows:
                s1tempfile = URLFileManager.getinstance().generateurlfilepath() + constant.WEBKIT_FILE_SUFFIX
                FileUtility.writelines(s1tempfile, rows)
                rows = []
    if FileUtility.exists(s2file):
        FileUtility.copy(s2file, s2temppath)
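# Splitting sketch (values hypothetical): with SPIDER_S1_MAX_LINE_PER_FILE = 100
# and 250 non-empty s1 input lines, copyfiles() writes three webkit URL files of
# 100, 100 and 50 lines; the s2 input file is copied to the query temp path as-is.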
def processfile(self, jsonfile):
    if not self.preprocess(jsonfile):
        return
    method = self.requesttype(jsonfile)
    urls = self.backupfile(jsonfile)
    context = URLFileManager.getinstance().geturlfilecontext(FileUtility.getfilename(jsonfile))
    with open(jsonfile, 'r') as fp:
        lines = fp.readlines()
        for line in lines:
            param = self.analysis(line, method)
            if param is None:
                continue
            url = param.url
            if context.retry >= 2:
                param.lastretry = True
            if method == constant.REQUEST_TYPE_POST:
                url = json.dumps({'url': param.url, 'data': param.data})
            info = None
            if URLManager.getinstance().exist(url):
                info = URLManager.getinstance().geturlcontext(url)
                param.originalurl = info.originalurl
                param.step = info.step
                param.type = info.type
                param.customized = info.customized
            else:
                param.originalurl = param.url
                param.type = URLContext.S1_MAIN_BODY
            if SiteS2Query.REFER_URL in param.customized:
                site = self.factory.getsite(param.customized[SiteS2Query.REFER_URL])
            else:
                site = self.factory.getsite(param.originalurl)
            res = site.process(param)
            if not res:
                if info:
                    URLManager.getinstance().seturlcontext(param.url, info)
            else:
                if url in urls:
                    urls[url] -= 1
                    if urls[url] == 0:
                        urls.pop(url)
    # upload failed urls
    if urls:
        self.retrydownload(jsonfile, urls)
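# Rough processing flow (a sketch, not part of the original code):
#   processfile(jsonfile)
#     -> preprocess(jsonfile)            # restore channel/query from the URL file context
#     -> analysis(line, method)          # parse one downloaded result line
#     -> site.process(param)             # dispatch to the matching site parser
#     -> retrydownload(jsonfile, urls)   # re-queue whatever is still missing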
def dumpurls(self):
    # Dump the URL list for each query assigned to this machine and store it in the corresponding file.
    s2file = SpiderConfigure.getinstance().gets2file()
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
    # querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
    querys = QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
    for query in querys:
        Logger.getlogging().debug('Now, Starting Select url to Insert and Update for uploading location urlfile!')
        self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
        self.conf.setquery(query)
        # Please do not delete the comments below.
        # 1. Convert data within the period:
        #    1.1 publishdate exists and falls within the last week
        #    1.2 publishdate is 0: fall back to the create time, which must fall within the last week
        # wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
        # (({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
        # ({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
        # where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
        #                       key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
        #                       createdate=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #                       starttime=SpiderConfigure.getinstance().starttime(),
        #                       time0='\"'+TimeUtility.getuniformtime(0)+'\"',
        #                       time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
        #                       time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #                       timeformat='\"'+TimeUtility.SQLTIMEFORMAT+'\"',
        #                       secs=self.period * 24 * 60 * 60)
        where = {
            SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
            SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG
        }
        Logger.getlogging().debug('Query condition: {where}'.format(where=str(where)))
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urltemplist = []
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
            createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            if (publishdate == TimeUtility.getintformtime(0) and
                    SQLDAO.gettime() - createdate <= self.period * 24 * 60 * 60) or \
               (publishdate != TimeUtility.getintformtime(0) and
                    SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24 * 60 * 60):
                if url not in urltemplist:
                    urltemplist.append(url)
                    params = PageBasicInfo()
                    params.url = url
                    NewsStorage.seturlinfos(params)
        # 2. Extract data whose createdate equals this run's start time.
        URLFileManager.getinstance().generateurlfilepath()
        where = {
            SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
            SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG,
            SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: SpiderConfigure.getinstance().starttime()
        }
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urllist = []
        linecount = 0
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            urllist.append(url)
            context = URLContext()
            context.originalurl = url
            context.type = URLContext.S1_MAIN_BODY
            context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[SQLDAO.SPIDER_TABLE_NEWS_TYPE]
            Logger.getlogging().debug(url)
            URLManager.getinstance().storeurl(url, context, REQUEST_TYPE_WEBKIT)
            linecount += 1
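# Selection sketch (summary only): step 1 keeps rows for this query/machine whose
# publishdate (or, when publishdate is 0, createdate) lies within the last
# `self.period` days and refreshes their storage entries; step 2 re-queues every
# row created at this run's start time as a webkit request.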