def load_argv(self, argv):
    """Populate builder settings from the command line.

    Falls back to the legacy positional format (exactly six arguments,
    none of them flag-like) before attempting keyword-style parsing.
    """
    legacy_form = len(argv) == 6 and not any(map(lambda flag: flag.startswith('-'), argv))
    if legacy_form:
        _parse_old_argv(self, argv)
        return
    valid_flags = ['--clean']
    valid_kv_settings = [
        '--cfg', '-cfg', '--config',
        '--workdir', '--webstagedir', '--glideinwmsdir',
        '--gridmapfile', '--mapfile', '--gridmap',
        '-url', '--weburl', '--url',
    ]
    # Double-dash flags are case insensitive; drop the program name
    # (argv[0]) and normalize the remaining tokens.
    normalized = []
    for token in argv[1:]:
        if token.startswith('--'):
            normalized.append(token.strip().lower())
        else:
            normalized.append(token)
    parsed = parse_argv(normalized,
                        valid_flags=valid_flags,
                        valid_kv_settings=valid_kv_settings)
    # For aliased settings, the first supplied spelling wins.
    self.shouldClean = parsed.get('--clean')
    self.cfgFile = parsed.get('--cfg') or parsed.get('--config') or parsed.get('-cfg')
    self.workDir = parsed.get('--workdir')
    self.webStageDir = parsed.get('--webstagedir')
    self.glideinWMSDir = parsed.get('--glideinwmsdir')
    self.gridmapFile = parsed.get('--gridmapfile') or parsed.get('--gridmap') or parsed.get('--mapfile')
    self.webURL = parsed.get('--weburl') or parsed.get('--url') or parsed.get('-url')
    self.load_cfg()
    self._setup_logger()
    ilog('Created Ilan WebStructBuilder: %s' % str(self))
def __init__(self, argv):
    """Build the glideTester configuration from command-line arguments.

    Seeds every known setting to None (or a default), parses argv for
    config/params/runId overrides, then loads and verifies the external
    configuration files.
    """
    # glideTester.cfg values
    self.runId = None
    self.glideinWMSDir = None
    self.configDir = None
    self.proxyFile = None
    self.pilotFile = None
    self.delegateProxy = None
    self.collectorNode = None
    self.gfactoryNode = None
    self.gfactoryConstraint = None
    self.gfactoryClassadID = None
    self.myClassadID = None
    self.mySecurityName = None
    # parameters.cfg values
    self.executable = None
    self.inputFile = None
    self.outputFile = None
    self.environment = None
    self.getenv = None
    self.arguments = None
    self.x509userproxy = None
    self.concurrencyLevel = None
    self.runs = 1
    self.gfactoryAdditionalConstraint = None
    self.additionalClassAds = []
    # parse arguments
    valid_keys = ['-config', '-cfg', '--config', '-params', '-runId']
    arg_map = parse_argv(argv[1:], valid_kv_settings=valid_keys)
    # Any of the three config spellings is accepted; first match wins.
    passed_config_path = arg_map.get('-cfg') or arg_map.get('--config') or arg_map.get('-config')
    passed_params_path = arg_map.get('-params')
    self.cfg_paths = get_config_file_list(file_name='glideTester.cfg', arg_path=passed_config_path)
    self.params_path = get_config_file_list(file_name='parameters.cfg', arg_path=passed_params_path)
    self.runId = arg_map.get('-runId')
    # check and fix the attributes
    if self.runId == None:
        # not defined, create one specific for the account
        # should not be too random, or you pollute the factory namespace
        self.runId = "u%i" % os.getuid()
    # load external values
    self.load_cfg()
    self.verify_cfg()
    # set search path so the glideinwms package (and its parent dir)
    # become importable before load_params/run need them
    if self.glideinWMSDir is not None:
        sys.path.insert(0, self.glideinWMSDir)
        sys.path.insert(0, os.path.join(self.glideinWMSDir, ".."))
    self.load_config_dir()
    self.load_params()
    self.setup_logger()
    ilog("Made glideTester: \n\n%s\n" % dbgp(self, 4))
def cleanStructs(self):
    """Remove any work/web-staging directories left over from a previous run."""
    import shutil
    # (label, directory) pairs; an unset (None) directory is skipped.
    for label, target_dir in (("workdir", self.workDir),
                              ("webStageDir", self.webStageDir)):
        if target_dir is not None and os.path.exists(target_dir):
            ilog("Cleaning old %s: %s" % (label, target_dir))
            shutil.rmtree(target_dir)
def cleanStructs(self):
    """Delete the builder's work and web-staging directories if present."""
    import shutil  # imported lazily; only needed when actually cleaning
    if self.workDir is not None and os.path.exists(self.workDir):
        ilog("Cleaning old workdir: %s"%self.workDir)
        shutil.rmtree(self.workDir)
    if self.webStageDir is not None and os.path.exists(self.webStageDir):
        ilog("Cleaning old webStageDir: %s"%self.webStageDir)
        shutil.rmtree(self.webStageDir)
def reload_proxy(self):
    """(Re)load the X509 proxy file named by self.proxy_fname.

    Side effects: sets self.proxy_data to the raw file contents and
    splits it into self.public_cert / self.private_cert via
    _parse_proxy_certs. If no proxy file is configured, proxy_data is
    reset to None and the cert attributes are left untouched.
    """
    ilog('Reloading proxy from fname: %s' % str(self.proxy_fname))
    # 'is None' instead of '== None': identity test is the correct idiom.
    if self.proxy_fname is None:
        self.proxy_data = None
        return
    # 'with' guarantees the descriptor is closed even if read/parse raises,
    # replacing the manual open/try/finally dance.
    with open(self.proxy_fname, 'r') as proxy_fd:
        self.proxy_data = proxy_fd.read()
        (self.public_cert, self.private_cert) = self._parse_proxy_certs(self.proxy_data)
def reload_proxy(self):
    """(Re)load the X509 proxy file named by self.proxy_fname.

    Side effects: sets self.proxy_data to the raw file contents and
    splits it into self.public_cert / self.private_cert via
    _parse_proxy_certs. If no proxy file is configured, proxy_data is
    reset to None and the cert attributes are left untouched.
    """
    ilog('Reloading proxy from fname: %s' % str(self.proxy_fname))
    # 'is None' instead of '== None': identity test is the correct idiom.
    if self.proxy_fname is None:
        self.proxy_data = None
        return
    # 'with' guarantees the descriptor is closed even if read/parse raises,
    # replacing the manual open/try/finally dance.
    with open(self.proxy_fname, 'r') as proxy_fd:
        self.proxy_data = proxy_fd.read()
        (self.public_cert, self.private_cert) = self._parse_proxy_certs(self.proxy_data)
def cleanup_glideins(self): ilog('Thread is cleaning up glideins.') from glideinwms.frontend import glideinFrontendInterface from glideinwms.lib import condorMonitor, condorExe # Deadvertize my add, so the factory knows we are gone for factory_pool in self.factory_pools: factory_pool_node=factory_pool[0] ilog('Deadvertising for node %s'%dbgp(factory_pool_node)) try: glideinFrontendInterface.deadvertizeAllWork(factory_pool_node,self.client_name) except RuntimeError, e: self.errors.append((time.time(),"Deadvertizing failed: %s"%e)) except:
def cleanup_glideins(self): ilog('Thread is cleaning up glideins.') from glideinwms.frontend import glideinFrontendInterface from glideinwms.lib import condorMonitor, condorExe # Deadvertize my add, so the factory knows we are gone for factory_pool in self.factory_pools: factory_pool_node = factory_pool[0] ilog('Deadvertising for node %s' % dbgp(factory_pool_node)) try: glideinFrontendInterface.deadvertizeAllWork( factory_pool_node, self.client_name) except RuntimeError, e: self.errors.append( (time.time(), "Deadvertizing failed: %s" % e)) except:
def load_argv(self, argv):
    """Populate builder settings from argv.

    Supports two forms: the legacy positional call (exactly six
    arguments, none flag-like) handled by _parse_old_argv, and the
    keyword-flag form parsed below.
    """
    if len(argv) == 6 and not any(
            map(lambda flag: flag.startswith('-'), argv)):
        _parse_old_argv(self, argv)
        return
    valid_flags = ['--clean']
    valid_kv_settings = [
        '--cfg', '-cfg', '--config', '--workdir', '--webstagedir',
        '--glideinwmsdir', '--gridmapfile', '--mapfile', '--gridmap',
        '-url', '--weburl', '--url'
    ]
    # Our flags are not case sensitive; first, remove the program's name, then
    # map them into the correct format.
    # NOTE(review): only '--'-prefixed tokens are lower-cased, so the
    # single-dash aliases ('-cfg', '-url') stay case sensitive — confirm
    # whether that is intended.
    normed_argv = list(
        map(
            lambda arg: arg.strip().lower() if arg.startswith('--') else arg,
            argv[1:]))
    flags = parse_argv(normed_argv,
                       valid_flags=valid_flags,
                       valid_kv_settings=valid_kv_settings)
    self.shouldClean = flags.get('--clean')
    # For aliased settings, the first supplied spelling wins.
    self.cfgFile = flags.get('--cfg') or flags.get(
        '--config') or flags.get('-cfg')
    self.workDir = flags.get('--workdir')
    self.webStageDir = flags.get('--webstagedir')
    self.glideinWMSDir = flags.get('--glideinwmsdir')
    self.gridmapFile = flags.get('--gridmapfile') or flags.get(
        '--gridmap') or flags.get('--mapfile')
    self.webURL = flags.get('--weburl') or flags.get('--url') or flags.get(
        '-url')
    self.load_cfg()
    self._setup_logger()
    ilog('Created Ilan WebStructBuilder: %s' % str(self))
def createStructs(self):
    """Create the glidekeeper work/web-stage directory structure on disk.

    Populates the cgkWDictFile dictionaries from this builder's settings,
    writes them out, and prints where the results landed.
    """
    ilog('Running createStructs for builder: %s' % str(self))
    self._setup_gwms_path()
    import cgkWDictFile
    # Log where the module was imported from, purely for debugging;
    # failure to introspect is harmless.
    try:
        import inspect
        srcf = inspect.getsourcefile(cgkWDictFile)
    except:
        srcf = 'ERROR'
    ilog("Imported cgkWDictFile from %s" % srcf)
    #Create the config files
    ilog('Creating struct.')
    dicts = cgkWDictFile.glideKeeperDicts(self.workDir, self.webStageDir)
    dicts.populate(self.webURL, self.gridmapFile)
    dicts.create_dirs()
    dicts.save()
    self._create_empty_web_index()
    ilog('Done.')
    print "Created config files in %s\n" % dicts.work_dir
    print "Web files in %s" % dicts.stage_dir
    print "If needed, move them so they are accessible from\n %s" % self.webURL
def createStructs(self): ilog('Running createStructs for builder: %s'%str(self)) self._setup_gwms_path() import cgkWDictFile try: import inspect srcf = inspect.getsourcefile(cgkWDictFile) except: srcf = 'ERROR' ilog("Imported cgkWDictFile from %s"%srcf) #Create the config files ilog('Creating struct.') dicts=cgkWDictFile.glideKeeperDicts(self.workDir,self.webStageDir) dicts.populate(self.webURL,self.gridmapFile) dicts.create_dirs() dicts.save() self._create_empty_web_index() ilog('Done.') print "Created config files in %s\n"%dicts.work_dir print "Web files in %s"%dicts.stage_dir print "If needed, move them so they are accessible from\n %s"%self.webURL
def __init__(self, web_url, descript_fname, descript_signature,
             group_name, group_descript_fname, group_descript_signature,
             security_name, instance_id, classad_id,
             factory_pools, factory_constraint,
             collector_node,
             proxy_fname,
             session_id=None):
    """Thread that advertises to factories and tracks our glideins.

    session_id should be a uniq string; when None, one is derived from
    the current time and pid.
    """
    ilog("Initting new GlideKeeperThread.")
    threading.Thread.__init__(self)
    # consts
    self.signature_type = "sha1"
    self.max_request = 100
    # strings, describe Web downloadable info
    self.web_url = web_url
    self.descript_fname = descript_fname
    self.descript_signature = descript_signature
    ilog("Thread web info: \n\tweb_url: %s\n\tdescript_fname: %s\n\tdescript_signature: %s" % (web_url, descript_fname, descript_signature))
    self.group_name = group_name
    self.group_descript_fname = group_descript_fname
    self.group_descript_signature = group_descript_signature
    ilog("Thread group info: \n\tgroup_name: %s\n\tdescript_fname: %s\n\tdescript_signature: %s" % (group_name, group_descript_fname, group_descript_signature))
    # string, used for identification
    self.security_name = security_name
    self.instance_id = instance_id
    glidekeeper_id = "%s_%s" % (security_name, instance_id)
    self.glidekeeper_id = glidekeeper_id
    client_name = "%s.%s" % (glidekeeper_id, self.group_name)
    self.client_name = client_name
    ilog('Thread security info: \n\tsecurity_name: %s\n\tinstance_id: %s\n\tglidekeeper_id: %s\n\tclient_name: %s' % (security_name, instance_id, glidekeeper_id, client_name))
    if session_id == None:
        # should be as unique as possible
        # in the context of the instance_id
        session_id = "%s_%s" % (time.time(), os.getpid())
    self.session_id = session_id
    ilog('Thread session_id: %s' % session_id)
    # ClassAd constraints used to recognize our own glideins in the pool.
    self.instance_constraint = 'GLIDETESTER_InstanceID=?="%s"' % self.glidekeeper_id
    self.session_constraint = 'GLIDETESTER_SessionID=?="%s"' % self.session_id
    self.glidekeeper_constraint = "(%s)&&(%s)" % (self.instance_constraint, self.session_constraint)
    ilog('Thread glidein constraints: %s' % self.glidekeeper_constraint)
    # string, what our ads will be identified at the factories
    self.classad_id = classad_id
    ilog('Thread classad_id: %s' % classad_id)
    # factory pools is a list of pairs, where
    # [0] is factory node
    # [1] is factory identity
    self.factory_pools = factory_pools
    # string or None
    self.factory_constraint = factory_constraint
    # string
    self.collector_node = collector_node
    self.proxy_fname = proxy_fname
    self.reload_proxy()  # provides proxy_data
    ilog('Backend info:\n\tfactory_pools: %s\n\tfactory_constraint: %s\n\tcollector_node: %s\n\tproxy_fname: %s' % (dbgp(factory_pools), factory_constraint, collector_node, proxy_fname))
    #############################
    # keep it simple, start with 0, requests will come later
    self.needed_glideins = 0
    self.need_cleanup = False  # if never requested more than 0, then no need to do cleanup
    self.running_glideins = 0
    self.errors = []
    ##############################
    self.shutdown = False
def go_request_glideins(self): ilog('Entered go_request_glideins.') from glideinwms.frontend import glideinFrontendInterface from glideinwms.lib import condorMonitor, condorExe, pubCrypto from glideinwms.frontend.glideinFrontendPlugins import proxy_plugins, createCredentialList # query job collector ilog('Checking the condor pool.') try: pool_status = condorMonitor.CondorStatus() pool_status.load( '(IS_MONITOR_VM=!=True)&&(%s)' % self.glidekeeper_constraint, [('State', 's')]) running_glideins = len(pool_status.fetchStored()) del pool_status self.running_glideins = running_glideins ilog('Found %d glideins in the pool.' % running_glideins) except: self.errors.append((time.time(), "condor_status failed")) return # query WMS collector ilog('Checking factory glideins.') glidein_dict = {} for factory_pool in self.factory_pools: factory_pool_node = factory_pool[0] factory_identity = factory_pool[1] try: if self.proxy_data != None: full_constraint = self.factory_constraint + ' && (PubKeyType=?="RSA") && (GlideinAllowx509_Proxy=!=False)' else: full_constraint = self.factory_constraint + ' && (GlideinRequirex509_Proxy=!=True)' ilog( 'Running findGlideins with these params: \n\tpool: %s\n\tident: %s\n\tsigtype: %s\n\tconstraints: %s' % ( str(factory_pool_node), str(None), str(self.signature_type), str(full_constraint) #str(self.proxy_data!=None), #str(True) )) factory_glidein_dict = glideinFrontendInterface.findGlideins( factory_pool_node, None, #factory_identity, #TODO: How do we authenticate with the factory? 
self.signature_type, full_constraint #self.proxy_data!=None, #get_only_matching=True ) except RuntimeError, e: factory_glidein_dict = { } # in case of error, treat as there is nothing there ilog('Error from findGlideins: %s' % str(e)) ilog('Found %d possible in factory_pool %s' % (len(factory_glidein_dict.keys()), dbgp(factory_pool))) for glidename in factory_glidein_dict.keys(): ilog('Now testing glidein with name %s' % glidename) glidein_el = factory_glidein_dict[glidename] ilog('Glidein stats: \n\n %s \n\n' % dbgp(glidein_el)) if not glidein_el['attrs'].has_key( 'PubKeyType'): # no pub key at all, skip ilog('%s has no PubKeyType -- skipping.' % glidename) continue elif glidein_el['attrs'][ 'PubKeyType'] == 'RSA': # only trust RSA for now try: # augment glidein_el['attrs']['PubKeyObj'] = pubCrypto.PubRSAKey( str( re.sub(r"\\+n", r"\n", glidein_el['attrs']['PubKeyValue']))) # and add glidein_dict[(factory_pool_node, glidename)] = glidein_el ilog('Adding %s to glidein_dict' % glidename) except RuntimeError, e: ilog('Hit error when adding %s to glidein_dict:\n%s' % (glidename, str(e))) continue # skip except:
data=check1.fetchStored() ilog('Success!') except RuntimeError,e: main_log.write("%s %s\n"%(ctime(), "condor_q failed (%s)... ignoring for now"%e)) main_log.flush() sleep(2) continue # retry the while loop except: main_log.write("%s %s\n"%(ctime(), "condor_q failed (reason unknown)... ignoring for now")) main_log.flush() sleep(2) continue # retry the while loop runningGlideins = len(data) ilog('Found %s jobs running.'%len(data.keys())) main_log.write("%s %s %s\n"%(ctime(), runningGlideins, 'jobs running')) main_log.flush() if runningGlideins == 0: main_log.write("%s %s\n"%(ctime(), "no more running jobs")) break else: sleep(10) def parse_result(config,workingDir,concurrencyLevel): # Create a loop to parse each log file into a summaries directory summDir = workingDir + '/summaries/' os.makedirs(summDir) for l in range(0, config.runs, 1): for k in range(0, len(concurrencyLevel), 1):
def parse_result(config, workingDir, concurrencyLevel):
    """Parse the condor user logs of every run/concurrency combination.

    Writes, under <workingDir>/summaries/, one per-run detail file with
    per-job execute/finish times and return values, plus an aggregate
    'results.txt' line with Ave/Min/Max timings. A missing log file is
    reported as a failed level rather than raising.
    """
    # Create a loop to parse each log file into a summaries directory
    summDir = workingDir + '/summaries/'
    os.makedirs(summDir)
    for l in range(0, config.runs, 1):
        for k in range(0, len(concurrencyLevel), 1):
            # Initialize empty arrays for data
            results = []
            hours = []
            minutes = []
            seconds = []
            jobStartInfo = []
            jobExecuteInfo = []
            jobFinishInfo = []
            jobStatus = []
            # Parse each log file
            logFile = workingDir + '/con_' + concurrencyLevel[k] + '_run_' + str(l) + '.log'
            if not os.path.exists(logFile):
                # If the log file doesn't exist, then the run failed.
                # Report that in the summaries.
                filePath = summDir + 'con_' + concurrencyLevel[k] + '_run_' + str(l) + '.txt'
                file = open(filePath, 'w')
                header = "# Test Results for " + config.executable + " run at concurrency Level " + concurrencyLevel[k] + '\n\nJob\tExec\tFinish\tReturn\nNumber\tTime\tTime\tValue\n'
                file.write(header)
                file.write('#ERROR: Could not read log file. Did this level actually run?')
                file.close()
                filepath = summDir + 'results.txt'
                file = open(filepath, 'a')
                times = "Concurrency_Level = " + concurrencyLevel[k] + "\t Execute_Time_(Ave/Min/Max) = " + 'ERROR: Failed' + '/' + 'ERROR: Failed' + '/' + 'ERROR: Failed' + "\t Finish_Time_(Ave/Min/Max) = " + 'ERROR: Failed' + "/" + 'ERROR: Failed' + "/" + 'ERROR: Failed' + '\n'
                file.write(times)
                file.close()
                continue
            lf = open(logFile, 'r')
            try:
                lines1 = lf.readlines()
            finally:
                lf.close()
            jobsSubmitted = 0
            for line in lines1:
                line = line.strip()
                # Event lines start with an event number or '(job.id)'.
                if line[0:1] not in ('0','1','2','3','4','5','6','7','8','9','('):
                    continue # ignore unwanted text lines
                arr1 = line.split(' ', 7)
                # NOTE(review): a too-short line is only logged here but is
                # still indexed below (arr1[4]/arr1[5]), which would raise
                # IndexError — confirm whether such lines can occur.
                if len(arr1) < 5:
                    ilog('ERROR: Line too small for parsing: %s'%(str(arr1)))
                if arr1[5] == "Bytes" or arr1[4] == "Image":
                    continue
                # jobStartInfo/jobExecuteInfo/jobFinishInfo are flat lists of
                # alternating (job number, timestamp[, status]) entries.
                if arr1[5] == "submitted":
                    jobNum = arr1[1].strip('()')
                    jobStartInfo.append(jobNum)
                    jobStartInfo.append(arr1[3])
                    jobsSubmitted = jobsSubmitted + 1
                if arr1[5] == "executing":
                    jobNum = arr1[1].strip('()')
                    jobExecuteInfo.append(jobNum)
                    jobExecuteInfo.append(arr1[3])
                if arr1[5] == "terminated.":
                    jobNum = arr1[1].strip('()')
                    jobFinishInfo.append(jobNum)
                    jobFinishInfo.append(arr1[3])
                if arr1[4] == "value":
                    status = arr1[5].split(')', 1)
                    jobFinishInfo.append(status[0])
            # Set some variables
            minExeTime = 1e20
            maxExeTime = 0
            minFinTime = 1e20
            maxFinTime = 0
            iter = 0
            # Match each submitted job to its execute/terminate events and
            # compute wall-clock deltas from the HH:MM:SS timestamps.
            for i in range(0, len(jobStartInfo), 2):
                if jobStartInfo[i] in jobExecuteInfo:
                    index = jobExecuteInfo.index(jobStartInfo[i])
                    timeJobStart = jobStartInfo[i + 1]
                    timeJobExecute = jobExecuteInfo[index + 1]
                    timeStart = timeJobStart.split(':', 2)
                    timeExecute = timeJobExecute.split(':', 2)
                    diffHours = (int(timeExecute[0]) - int(timeStart[0])) * 3600
                    diffMinutes = (int(timeExecute[1]) - int(timeStart[1])) * 60
                    diffSeconds = int(timeExecute[2]) - int(timeStart[2])
                    executeTime = diffHours + diffMinutes + diffSeconds
                    index2 = jobFinishInfo.index(jobStartInfo[i])
                    timeJobFinish = jobFinishInfo[index2 + 1]
                    stat = jobFinishInfo[index2 +2]
                    timeFinish = timeJobFinish.split(':', 2)
                    diffHours2 = (int(timeFinish[0]) - int(timeExecute[0])) * 3600
                    diffMinutes2 = (int(timeFinish[1]) - int(timeExecute[1])) * 60
                    diffSeconds2 = int(timeFinish[2]) - int(timeExecute[2])
                    finishTime = diffHours2 + diffMinutes2 + diffSeconds2
                    resultData = [iter, executeTime, finishTime, stat]
                    results.append(resultData)
                    iter = iter + 1
                    if executeTime > maxExeTime:
                        maxExeTime = executeTime
                    if executeTime < minExeTime:
                        minExeTime = executeTime
                    if finishTime > maxFinTime:
                        maxFinTime = finishTime
                    if finishTime < minFinTime:
                        minFinTime = finishTime
            # Create summary directory structure
            filePath = summDir + 'con_' + concurrencyLevel[k] + '_run_' + str(l) + '.txt'
            file = open(filePath, 'w')
            header = "# Test Results for " + config.executable + " run at concurrency Level " + concurrencyLevel[k] + '\n\nJob\tExec\tFinish\tReturn\nNumber\tTime\tTime\tValue\n'
            file.write(header)
            exeTime = 0
            finTime = 0
            # NOTE(review): assumes exactly concurrencyLevel[k] entries made
            # it into results; fewer matched jobs would raise IndexError.
            for i in range(0, int(concurrencyLevel[k])):
                exeTime = exeTime + results[i][1]
                finTime = finTime + results[i][2]
                writeData = str(results[i][0]) + '\t' + str(results[i][1]) + '\t' + str(results[i][2]) + '\t' + results[i][3] + '\n'
                file.write(writeData)
            aveExeTime = exeTime/int(concurrencyLevel[k])
            aveFinTime = finTime/int(concurrencyLevel[k])
            file.close()
            filepath = summDir + 'results.txt'
            file = open(filepath, 'a')
            times = "Concurrency_Level = " + concurrencyLevel[k] + "\t Execute_Time_(Ave/Min/Max) = " + str(aveExeTime) + '/' + str(minExeTime) + '/' + str(maxExeTime) + "\t Finish_Time_(Ave/Min/Max) = " + str(aveFinTime) + "/" + str(minFinTime) + "/" + str(maxFinTime) + '\n'
            file.write(times)
            file.close()
def __init__(self, web_url, descript_fname, descript_signature,
             group_name, group_descript_fname, group_descript_signature,
             security_name, instance_id, classad_id,
             factory_pools, factory_constraint,
             collector_node,
             proxy_fname,
             session_id=None):
    """Thread that advertises to factories and tracks our glideins.

    session_id should be a uniq string; when None, one is derived from
    the current time and pid. An explicitly empty session_id disables
    session matching (instance-only constraint).
    """
    ilog("Initting new GlideKeeperThread.")
    threading.Thread.__init__(self)
    # consts
    self.signature_type = "sha1"
    self.max_request = 100
    # strings, describe Web downloadable info
    self.web_url = web_url
    self.descript_fname = descript_fname
    self.descript_signature = descript_signature
    ilog(
        "Thread web info: \n\tweb_url: %s\n\tdescript_fname: %s\n\tdescript_signature: %s"
        % (web_url, descript_fname, descript_signature))
    self.group_name = group_name
    self.group_descript_fname = group_descript_fname
    self.group_descript_signature = group_descript_signature
    ilog(
        "Thread group info: \n\tgroup_name: %s\n\tdescript_fname: %s\n\tdescript_signature: %s"
        % (group_name, group_descript_fname, group_descript_signature))
    # string, used for identification
    self.security_name = security_name
    self.instance_id = instance_id
    glidekeeper_id = "%s_%s" % (security_name, instance_id)
    self.glidekeeper_id = glidekeeper_id
    client_name = "%s.%s" % (glidekeeper_id, self.group_name)
    self.client_name = client_name
    ilog(
        'Thread security info: \n\tsecurity_name: %s\n\tinstance_id: %s\n\tglidekeeper_id: %s\n\tclient_name: %s'
        % (security_name, instance_id, glidekeeper_id, client_name))
    if session_id == None:
        # should be as unique as possible
        # in the context of the instance_id
        session_id = "%s_%s" % (time.time(), os.getpid())
    self.session_id = session_id
    ilog('Thread session_id: %s' % session_id)
    self.instance_constraint = 'GLIDETESTER_InstanceID=?="%s"' % self.glidekeeper_id
    # Empty session_id means "match any session": constrain on the
    # instance only; otherwise require both instance and session.
    if len(self.session_id) != 0:
        self.session_constraint = 'GLIDETESTER_SessionID=?="%s"' % self.session_id
        self.glidekeeper_constraint = "(%s)&&(%s)" % (
            self.instance_constraint, self.session_constraint)
    else:
        self.session_constraint = 'TRUE'
        self.glidekeeper_constraint = self.instance_constraint
    ilog('Thread glidein constraints: %s' % self.glidekeeper_constraint)
    # string, what our ads will be identified at the factories
    self.classad_id = classad_id
    ilog('Thread classad_id: %s' % classad_id)
    # factory pools is a list of pairs, where
    # [0] is factory node
    # [1] is factory identity
    self.factory_pools = factory_pools
    # string or None
    self.factory_constraint = factory_constraint
    # string
    self.collector_node = collector_node
    self.proxy_fname = proxy_fname
    self.reload_proxy()  # provides proxy_data
    ilog(
        'Backend info:\n\tfactory_pools: %s\n\tfactory_constraint: %s\n\tcollector_node: %s\n\tproxy_fname: %s'
        % (dbgp(factory_pools), factory_constraint, collector_node, proxy_fname))
    #############################
    # keep it simple, start with 0, requests will come later
    self.needed_glideins = 0
    self.need_cleanup = False  # if never requested more than 0, then no need to do cleanup
    self.running_glideins = 0
    self.errors = []
    ##############################
    self.shutdown = False
def soft_kill(self):
    """Ask the worker thread to exit cooperatively (non-blocking).

    Only sets the shutdown flag; the thread's loop is expected to poll
    it. Callers that need completion should join() afterwards.
    """
    ilog('Requesting a soft kill from the thread.')
    self.shutdown = True
def load_params(self):
    """Fill unset job parameters from the parameters.cfg file list.

    Iterates self.params_path in order; each setting is only taken from
    the first file that provides it (attributes already set — e.g. on
    the command line — are never overwritten). Raises RuntimeError for
    paths that do not exist (executable, input files, proxy).
    """
    file_paths = self.params_path
    for fl in file_paths:
        config = parse_kv_file(fl)
        self.load_additional_classads(config)
        # Once everything is populated, later files contribute nothing.
        if self.has_params():
            continue
        if self.executable is None:
            exec_path = config.settings.get('executable')
            if exec_path is None:
                pass
            elif not os.path.exists(exec_path):
                raise RuntimeError, "%s '%s' is not a valid executable"%('executable',exec_path)
            else:
                self.executable = exec_path
        if self.inputFile is None:
            input_files = config.settings.get('transfer_input_files')
            if input_files is not None:
                # Validate each comma-separated file and store absolute paths.
                arr = input_files.split(',')
                newarr = []
                for f in arr:
                    if not os.path.exists(f):
                        raise RuntimeError, "'%s' is not a valid file"%f
                    newarr.append(os.path.abspath(f))
                self.inputFile = string.join(newarr,',')
        if self.outputFile is None:
            output_files = config.settings.get('transfer_output_files')
            if output_files is not None:
                self.outputFile = output_files
        if self.environment is None:
            self.environment = config.settings.get('environment')
        if self.getenv is None:
            self.getenv = config.settings.get('getenv')
        if self.arguments is None:
            self.arguments = config.settings.get('arguments')
        if self.x509userproxy is None:
            val = config.settings.get('x509userproxy')
            if (val is not None) and (val!='') and (not os.path.exists(val)):
                raise RuntimeError, "'%s' is not a valid proxy"%val
            self.x509userproxy = val
        if self.concurrencyLevel is None:
            # Whitespace-separated list of concurrency levels (strings).
            concurrency = config.settings.get('concurrency')
            self.concurrencyLevel = concurrency.split()
        if self.runs is None:
            runs = config.settings.get('runs')
            if runs is not None:
                self.runs = int(runs)
        if self.gfactoryAdditionalConstraint is None:
            self.gfactoryAdditionalConstraint = config.settings.get('gfactoryAdditionalConstraint')
        if self.reuseOldGlideins is None:
            if not 'reuse_old_glideins' in config.settings:
                ilog(config.settings)
                continue
            # Anything empty or starting with 't' (true/T/...) means True.
            new_rog_raw = config.settings.get('reuse_old_glideins').strip().lower()
            if len(new_rog_raw) == 0:
                self.reuseOldGlideins = True
            elif new_rog_raw[0] == 't':
                self.reuseOldGlideins = True
            else:
                self.reuseOldGlideins = False
        if self.jobOutFormat is None:
            self.jobOutFormat = config.settings.get('initialDirFormat')
            if self.jobOutFormat is not None:
                self.verify_job_out_format()
        if self.prescript is None:
            self.prescript = config.settings.get('prescript')
        if self.postscript is None:
            self.postscript = config.settings.get('postscript')
    # NOTE(review): the flattened source makes the placement of the two
    # statements below ambiguous; they are assumed to run once, after all
    # parameter files have been processed — confirm against history.
    self.verify_prepostscript()
    # Fall back to a single run if 'runs' was never set to a valid int.
    if self.runs is None or type(self.runs) != int:
        self.runs = 1
def process_concurrency(config,gktid,main_log,workingDir,concurrencyLevel,run,k): ilog('Processing concurrency level %s => %s run number %s.\n\tgktid: %s\n\tworkingDir: %s\n\t log: %s'%(str(k), str(concurrencyLevel[k]), str(run), str(gktid), str(workingDir), str(main_log))) from glideinwms.lib import condorMonitor from glideinwms.lib import condorManager # request the glideins # we want 10% more glideins than the concurrency level requestedGlideins = int(concurrencyLevel[k]) totalGlideins = int(requestedGlideins + .1 * requestedGlideins) gktid.request_glideins(totalGlideins) main_log.write("%s %i Glideins requested\n"%(ctime(),totalGlideins)) # now we create the directories for each job and a submit file filename = workingDir + "/" + config.executable.replace('/', '__') + '_concurrency_' + concurrencyLevel[k] + '_run_' + str(run ) + '_submit.condor' filecontent = make_submit_file_content(config, gktid, main_log, workingDir, concurrencyLevel[k], run) condorSubmitFile=open(filename, "w") ilog('Creating condor file %s:\n%s'%(filename, filecontent )) condorSubmitFile.write(filecontent) condorSubmitFile.close() # Need to figure out when we have all the glideins # Ask the glidekeeper object ilog('Now waiting until the thread retrieves enough glideins.') numberGlideins = 0 while numberGlideins < requestedGlideins: errors=[] while 1: # since gktid runs in a different thread, pop is the only atomic operation I have try: errors.append(gktid.errors.pop()) except IndexError: break errors.reverse() if not len(errors) == 0: ilog('Have errors!') for err in errors: main_log.write("%s Error: %s\n"%(ctime(err[0]),err[1])) ilog('Found an error: %s'%err[1]) if not gktid.isAlive(): raise RuntimeError, "The glidekeeper thread unexpectedly died!" 
numberGlideins = gktid.get_running_glideins() ilog('Currently have %s running glideins out of %s.'%(numberGlideins, requestedGlideins)) main_log.write("%s %s %s %s %s\n"%(ctime(), 'we have', numberGlideins, 'glideins, need', requestedGlideins)) main_log.flush() sleep(5) # Now we begin submission and monitoring ilog('Got the glideins. Now submitting %s.'%filename) submission = condorManager.condorSubmitOne(filename) main_log.write("%s %s\n"%(ctime(), "file submitted")) runningGlideins = numberGlideins while runningGlideins > 0: if gktid.session_id is not None and len(gktid.session_id) > 0: qconstraint = '(JobStatus<3)&&(GK_InstanceId=?="%s")&&(GK_SessionId=?="%s")'%(gktid.glidekeeper_id,gktid.session_id) else: qconstraint = '(JobStatus<3)&&(GK_InstanceId=?="%s")'%(gktid.glidekeeper_id) ilog('Running condorQ to get the running jobs. Constraints: %s'%(qconstraint)) check1 = condorMonitor.CondorQ() try: # i actually want to see all jos, not only running ones check1.load(qconstraint, [("JobStatus","s")]) data=check1.fetchStored() ilog('Success!') except RuntimeError,e: main_log.write("%s %s\n"%(ctime(), "condor_q failed (%s)... ignoring for now"%e)) main_log.flush() sleep(2) continue # retry the while loop except:
def go_request_glideins(self):
    """Survey the condor pool and the factories for usable glideins.

    Updates self.running_glideins from the collector, then queries each
    configured factory pool and collects RSA-keyed glidein entries into
    glidein_dict. Errors are appended to self.errors rather than raised.
    """
    ilog('Entered go_request_glideins.')
    from glideinwms.frontend import glideinFrontendInterface
    from glideinwms.lib import condorMonitor, condorExe
    from glideinwms.frontend.glideinFrontendPlugins import proxy_plugins, createCredentialList
    # query job collector
    ilog('Checking the condor pool.')
    try:
        pool_status = condorMonitor.CondorStatus()
        # NOTE(review): the glidekeeper constraint is commented out below,
        # so ALL pool entries are counted, not just our glideins — confirm
        # this is deliberate.
        pool_status.load()#'(IS_MONITOR_VM=!=True)&&(%s)'%self.glidekeeper_constraint,[('State','s')])
        running_glideins = len(pool_status.fetchStored())
        del pool_status
        self.running_glideins = running_glideins
        ilog('Found %d glideins in the pool.'%running_glideins)
    except:
        self.errors.append((time.time(),"condor_status failed"))
        return
    # query WMS collector
    ilog('Checking factory glideins.')
    glidein_dict = {}
    for factory_pool in self.factory_pools:
        factory_pool_node = factory_pool[0]
        factory_identity = factory_pool[1]
        try:
            # With a proxy we require RSA-keyed factories that allow x509
            # proxies; without one, only entries that don't require a proxy.
            if self.proxy_data != None:
                full_constraint = self.factory_constraint +' && (PubKeyType=?="RSA") && (GlideinAllowx509_Proxy=!=False)'
            else:
                full_constraint = self.factory_constraint + ' && (GlideinRequirex509_Proxy=!=True)'
            ilog('Running findGlideins with these params: \n\tpool: %s\n\tident: %s\n\tsigtype: %s\n\tconstraints: %s'%(
                str(factory_pool_node),
                str(None),
                str(self.signature_type),
                str(full_constraint)
                #str(self.proxy_data!=None),
                #str(True)
                ))
            factory_glidein_dict = glideinFrontendInterface.findGlideins(
                factory_pool_node,
                None, #factory_identity, #TODO: How do we authenticate with the factory?
                self.signature_type,
                full_constraint
                #self.proxy_data!=None,
                #get_only_matching=True
                )
        except RuntimeError, e:
            factory_glidein_dict = {} # in case of error, treat as there is nothing there
            ilog('Error from findGlideins: %s'%str(e))
        ilog('Found %d possible in factory_pool %s'%(len(factory_glidein_dict.keys()), dbgp(factory_pool)))
        for glidename in factory_glidein_dict.keys():
            ilog('Now testing glidein with name %s'%glidename)
            glidein_el = factory_glidein_dict[glidename]
            ilog('Glidein stats: \n\n %s \n\n'%dbgp(glidein_el))
            if not glidein_el['attrs'].has_key('PubKeyType'): # no pub key at all, skip
                ilog('%s has no PubKeyType -- skipping.'% glidename)
                continue
            elif glidein_el['attrs']['PubKeyType']=='RSA': # only trust RSA for now
                try:
                    # augment: turn the advertised PEM text back into a key
                    # object (the ad escapes newlines as literal '\n').
                    glidein_el['attrs']['PubKeyObj'] = glideinFrontendInterface.pubCrypto.PubRSAKey(str(re.sub(r"\\+n", r"\n", glidein_el['attrs']['PubKeyValue'])))
                    # and add
                    glidein_dict[(factory_pool_node,glidename)] = glidein_el
                    ilog('Adding %s to glidein_dict'%glidename)
                except:
                    ilog('Hit error when adding %s to glidein_dict'%glidename)
                    continue # skip
            else: # invalid key type, skip
                ilog('%s has invalid PubKeyType -- skipping.'% glidename)
                continue
def run(config):
    """Drive a full glideTester session from an assembled config.

    Starts a GlideKeeperThread, runs every (run, concurrency) combination
    via process_concurrency with optional pre/post scripts, parses the
    results, and always shuts the thread down and closes the log.

    Fixes vs. previous version:
      * 'err_code is not 0' (identity test, relies on CPython small-int
        caching) replaced with 'err_code != 0' in both script checks;
      * main_log is now closed in the finally block.
    """
    os.environ['_CONDOR_SEC_DEFAULT_AUTHENTICATION_METHODS'] = 'FS,GSI'
    os.environ['X509_USER_PROXY'] = config.proxyFile
    import glideKeeper
    from glideinwms.lib import condorMonitor
    from glideinwms.lib import condorManager
    delegated_proxy = None
    if config.delegateProxy:
        if config.pilotFile is None:
            # use the service proxy as a backup solution
            delegated_proxy = config.proxyFile
        else:
            # use the pilot proxy, if available
            delegated_proxy = config.pilotFile
    if config.gfactoryAdditionalConstraint is None:
        gfactoryConstraint = config.gfactoryConstraint
    else:
        gfactoryConstraint = "(%s)&&(%s)" % (config.gfactoryConstraint, config.gfactoryAdditionalConstraint)
    # An empty-string session id tells the thread to match glideins from
    # any session (i.e. reuse old glideins).
    session_id_param = None
    if config.reuseOldGlideins is True:
        session_id_param = ''
    gktid = glideKeeper.GlideKeeperThread(config.webURL, config.descriptFile, config.descriptSignature,
                                          config.groupName, config.groupDescriptFile, config.groupDescriptSignature,
                                          config.mySecurityName, config.runId,
                                          config.myClassadID,
                                          [(config.gfactoryNode, config.gfactoryClassadID)], gfactoryConstraint,
                                          config.collectorNode,
                                          delegated_proxy,
                                          session_id=session_id_param)
    gktid.start()
    startupDir = os.getcwd()
    workingDir = startupDir + '/run_' + startTime
    os.makedirs(workingDir)
    main_log_fname = workingDir + '/glideTester.log'
    main_log = open(main_log_fname, 'w')
    try:
        main_log.write("Starting at: %s\n\n" % ctime())
        main_log.write("Factory: %s\n" % config.gfactoryNode)
        main_log.write("Constraint: %s\n" % gfactoryConstraint)
        main_log.write("Service Proxy: %s\n" % config.proxyFile)
        main_log.write("Pilot Proxy: %s\n" % delegated_proxy)
        main_log.write("InstanceID: %s\n" % gktid.glidekeeper_id)
        main_log.write("SessionID: %s\n\n" % gktid.session_id)
        concurrencyLevel = config.concurrencyLevel
        try:
            # Substitution values available to pre/post scripts.
            prescript_args = {'wd': str(workingDir),
                              'sd': os.getcwd(),
                              'ts': startTime,
                              'cmd': ' '.join(sys.argv)}
            if config.prescript is not None:
                prescript = construct_from_format(config.prescript, prescript_args)
                ilog('Running prescript: %s' % (prescript))
                err_code = os.system(prescript)
                if err_code != 0:  # was 'is not 0' — identity test bug
                    msg = 'Bad error code: ' + str(err_code)
                    raise RuntimeError(msg)
            # Create a testing loop for each run
            for l in range(0, config.runs, 1):
                main_log.write("Iteration %i\n" % l)
                # Create a testing loop for each concurrency
                for k in range(0, len(concurrencyLevel), 1):
                    main_log.write("Concurrency %i\n" % int(concurrencyLevel[k]))
                    process_concurrency(config, gktid, main_log, workingDir, concurrencyLevel, l, k)
            postscript_args = {'wd': str(workingDir),
                               'sd': os.getcwd(),
                               'ts': startTime,
                               'cmd': ' '.join(sys.argv)}
            if config.postscript is not None:
                postscript = construct_from_format(config.postscript, postscript_args)
                ilog('Running postscript: %s' % (postscript))
                err_code = os.system(postscript)
                if err_code != 0:  # was 'is not 0' — identity test bug
                    msg = 'Bad error code: ' + str(err_code)
                    raise RuntimeError(msg)
            main_log.write("%s %s\n" % (ctime(), "Done"))
        except:
            # Log the failure but still attempt result parsing below.
            tb = traceback.format_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2])
            main_log.write("%s %s\n" % (ctime(), "Exception: %s" % string.join(tb, '')))
        # Now we parse the log files
        parse_result(config, workingDir, concurrencyLevel)
    finally:
        main_log.write("%s %s\n" % (ctime(), "cleaning, then getting out"))
        main_log.flush()
        gktid.soft_kill()
        gktid.join()
        # print out any last minute errors
        for err in gktid.errors:
            main_log.write("%s Error: %s\n" % (ctime(err[0]), err[1]))
            ilog("%s Error: %s\n" % (ctime(err[0]), err[1]))
        main_log.write("Terminated at: %s\n" % ctime())
        main_log.close()  # previously leaked
    return
def process_concurrency(config,gktid,main_log,workingDir,concurrencyLevel,l,k): ilog('Processing concurrency level %s => %s run number %s.\n\tgktid: %s\n\tworkingDir: %s\n\t log: %s'%(str(k), str(concurrencyLevel[k]), str(l), str(gktid), str(workingDir), str(main_log))) from glideinwms.lib import condorMonitor from glideinwms.lib import condorManager universe = 'vanilla' transfer_executable = "True" when_to_transfer_output = "ON_EXIT" # disable the check for architecture, we are running a script # only match to our own glideins requirements = '(Arch =!= "fake")&&(%s)'%gktid.glidekeeper_constraint owner = 'Undefined' notification = 'Never' # request the glideins # we want 10% more glideins than the concurrency level requestedGlideins = int(concurrencyLevel[k]) totalGlideins = int(requestedGlideins + .1 * requestedGlideins) gktid.request_glideins(totalGlideins) main_log.write("%s %i Glideins requested\n"%(ctime(),totalGlideins)) # now we create the directories for each job and a submit file loop = 0 dir1 = workingDir + '/concurrency_' + concurrencyLevel[k] + '_run_' + str(l) + '/' os.makedirs(dir1) logfile = workingDir + '/con_' + concurrencyLevel[k] + '_run_' + str(l) + '.log' outputfile = 'concurrency_' + concurrencyLevel[k] + '.out' errorfile = 'concurrency_' + concurrencyLevel[k] + '.err' filename = workingDir + "/" + config.executable.replace('/', '__') + '_concurrency_' + concurrencyLevel[k] + '_run_' + str(l) + '_submit.condor' filecontent = '' condorSubmitFile=open(filename, "w") filecontent += ('universe = ' + universe + '\n' + 'executable = ' + config.executable + '\n' + 'transfer_executable = ' + transfer_executable + '\n' + 'when_to_transfer_output = ' + when_to_transfer_output + '\n' + 'Requirements = ' + requirements + '\n' + # '+Owner = ' + owner + '\n' + 'log = ' + logfile + '\n' + 'output = ' + outputfile + '\n' + 'error = ' + errorfile + '\n' + 'notification = ' + notification + '\n' + 'periodic_remove = 
((JobStatus!=2)&&(JobRunCount>0))||(JobRunCount>1)\n' + '+GK_InstanceId = "' + gktid.glidekeeper_id + '"\n' + '+GK_SessionId = "' + gktid.session_id + '"\n' + '+IsSleep = 1\n') if config.inputFile != None: filecontent += ('transfer_input_files = ' + config.inputFile + '\n') if config.outputFile != None: filecontent += ('transfer_output_files = ' + config.outputFile + '\n') if config.environment != None: filecontent += ('environment = ' + config.environment + '\n') if config.getenv != None: filecontent += ('getenv = ' + config.getenv + '\n') if config.arguments != None: filecontent += ('arguments = ' + config.arguments + '\n') if config.x509userproxy!=None: filecontent += ('x509userproxy = ' + config.x509userproxy + '\n\n') else: filecontent += ('x509userproxy = ' + config.proxyFile + '\n\n') #Added support for additional classAdds: for classAdd in config.additionalClassAds: name = classAdd[0] value = classAdd[1] filecontent += (name + ' = ' + value +'\n') for j in range(0, int(concurrencyLevel[k]), 1): filecontent += ('Initialdir = ' + dir1 + 'job' + str(loop) + '\n') filecontent += ('Queue\n\n') loop = loop + 1 for i in range(0, int(concurrencyLevel[k]), 1): dir2 = dir1 + 'job' + str(i) + '/' os.makedirs(dir2) ilog('Creating condor file %s:\n%s'%(filename, filecontent )) condorSubmitFile.write(filecontent) condorSubmitFile.close() # Need to figure out when we have all the glideins # Ask the glidekeeper object ilog('Now waiting until the thread retrieves enough glideins.') finished = "false" while finished != "true": errors=[] while 1: # since gktid runs in a different thread, pop is the only atomic operation I have try: errors.append(gktid.errors.pop()) except IndexError: break errors.reverse() if not len(errors) == 0: ilog('Have errors!') for err in errors: main_log.write("%s Error: %s\n"%(ctime(err[0]),err[1])) ilog('Found an error: %s'%err[1]) if not gktid.isAlive(): raise RuntimeError, "The glidekeeper thread unexpectedly died!" 
numberGlideins = gktid.get_running_glideins() ilog('Currently have %s running glideins out of %s.'%(numberGlideins, requestedGlideins)) main_log.write("%s %s %s %s %s\n"%(ctime(), 'we have', numberGlideins, 'glideins, need', requestedGlideins)) main_log.flush() sleep(5) if numberGlideins >= requestedGlideins: finished = "true" # Now we begin submission and monitoring ilog('Got the glideins. Now submitting %s.'%filename) submission = condorManager.condorSubmitOne(filename) main_log.write("%s %s\n"%(ctime(), "file submitted")) running = "true" while running != "false": ilog('Running condorQ to get the running jobs.') check1 = condorMonitor.CondorQ() try: # i actually want to see all jos, not only running ones check1.load('(JobStatus<3)&&(GK_InstanceId=?="%s")&&(GK_SessionId=?="%s")'%(gktid.glidekeeper_id,gktid.session_id), [("JobStatus","s")]) data=check1.fetchStored() ilog('Success!') except RuntimeError,e: main_log.write("%s %s\n"%(ctime(), "condor_q failed (%s)... ignoring for now"%e)) main_log.flush() sleep(2) continue # retry the while loop except:
def make_submit_file_content(config,gktid,main_log,workingDir,concurrency, run):
    """Build the text of a Condor submit file for one concurrency level/run,
    creating one job directory per queued job as a side effect.

    :param config: parsed glideTester configuration (executable, inputFile,
                   x509userproxy, additionalClassAds, jobOutFormat, ...)
    :param gktid: running GlideKeeperThread (supplies glidekeeper_id,
                  session_id and the glidein match constraint)
    :param main_log: open log file handle (currently unused here)
    :param workingDir: per-run working directory path
    :param concurrency: concurrency level as a string
    :param run: iteration number; NOTE: shadows the module-level run() inside
                this function's scope
    :return: the complete submit-file content as a single string
    """
    universe = 'vanilla'
    transfer_executable = "True"
    when_to_transfer_output = "ON_EXIT_OR_EVICT"
    # disable the check for architecture, we are running a script
    # only match to our own glideins
    requirements = '(Arch =!= "fake")&&(%s)'%gktid.glidekeeper_constraint
    owner = 'Undefined'  # kept for the commented-out +Owner line below
    notification = 'Never'

    logfile = workingDir + '/con_' + concurrency + '_run_' + str(run) + '.log'
    outputfile = 'concurrency_' + concurrency + '.out'
    errorfile = 'concurrency_' + concurrency + '.err'

    # periodic_remove kills any job that got evicted or restarted, so each
    # glidein runs at most one job instance
    filecontent = ('universe = ' + universe + '\n' +
                   'executable = ' + config.executable + '\n' +
                   'transfer_executable = ' + transfer_executable + '\n' +
                   'when_to_transfer_output = ' + when_to_transfer_output + '\n' +
                   'Requirements = ' + requirements + '\n' +
                   # '+Owner = ' + owner + '\n' +
                   'log = ' + logfile + '\n' +
                   'output = ' + outputfile + '\n' +
                   'error = ' + errorfile + '\n' +
                   'notification = ' + notification + '\n' +
                   'periodic_remove = ((JobStatus!=2)&&(JobRunCount>0))||(JobRunCount>1)\n' +
                   '+GK_InstanceId = "' + gktid.glidekeeper_id + '"\n' +
                   '+GK_SessionId = "' + gktid.session_id + '"\n' +
                   '+IsSleep = 1\n')

    # optional settings; IDIOM FIX: compare to None with is/is not
    if config.inputFile is not None:
        filecontent += ('transfer_input_files = ' + config.inputFile + '\n')
    if config.outputFile is not None:
        filecontent += ('transfer_output_files = ' + config.outputFile + '\n')
    if config.environment is not None:
        filecontent += ('environment = ' + config.environment + '\n')
    if config.getenv is not None:
        filecontent += ('getenv = ' + config.getenv + '\n')
    if config.arguments is not None:
        filecontent += ('arguments = ' + config.arguments + '\n')

    # proxy preference order: explicit job proxy, then pilot, then service
    if config.x509userproxy is not None:
        filecontent += ('x509userproxy = ' + config.x509userproxy + '\n\n')
    elif config.pilotFile is not None:
        filecontent += ('x509userproxy = '+config.pilotFile + '\n\n')
    else:
        filecontent += ('x509userproxy = ' + config.proxyFile + '\n\n')

    #Added support for additional classAdds:
    for classAdd in config.additionalClassAds:
        name = classAdd[0]
        value = classAdd[1]
        filecontent += (name + ' = ' + value +'\n')

    # Now we create the directories for each job and a submit file
    config.verify_job_out_format()
    ilog('Using job output format: %s'%(config.jobOutFormat))
    for i in range(0, int(concurrency), 1):
        args = {
            'j' : str(i),
            'c' : str(concurrency),
            'wd' : str(workingDir),
            'r' : str(run),
            'sd' : os.getcwd(),
            'ts' : startTime,
        }
        jobdir = construct_from_format(config.jobOutFormat, args)
        filecontent += ('Initialdir = ' +jobdir+ '\n')
        filecontent += ('Queue\n\n')
        os.makedirs(jobdir+'/')
    return filecontent
def soft_kill(self):
    """Ask the keeper thread to exit cleanly at its next loop iteration."""
    ilog('Requesting a soft kill from the thread.')
    # The thread's main loop polls this flag; no hard interrupt is sent.
    self.shutdown = True
def request_glideins(self,needed_glideins):
    """Set the target number of glideins for the keeper thread to maintain.

    :param needed_glideins: desired glidein count (int)
    """
    # BUGFIX: log message said 'glidens' instead of 'glideins'
    ilog('Requesting %d glideins from thread.'%needed_glideins)
    # the keeper thread picks this value up on its next loop pass
    self.needed_glideins=needed_glideins
# Deadvertize my add, so the factory knows we are gone for factory_pool in self.factory_pools: factory_pool_node=factory_pool[0] ilog('Deadvertising for node %s'%dbgp(factory_pool_node)) try: glideinFrontendInterface.deadvertizeAllWork(factory_pool_node,self.client_name) except RuntimeError, e: self.errors.append((time.time(),"Deadvertizing failed: %s"%e)) except: tb = traceback.format_exception(sys.exc_info()[0],sys.exc_info()[1], sys.exc_info()[2]) self.errors.append((time.time(),"Deadvertizing failed: %s"%string.join(tb,''))) # Stop all the glideins I can see ilog('Getting glidein pool status data.') try: pool_status=condorMonitor.CondorStatus() pool_status.load(self.glidekeeper_constraint,[('GLIDEIN_COLLECTOR_NAME','s'),('GLIDEIN_MASTER_NAME','s')]) pool_data=pool_status.fetchStored() except: self.errors.append((time.time(),"condor_status failed")) for k in pool_data.keys(): el=pool_data[k] ilog('Now killing pool with data: (%s -> %s)'%(dbgp(k), dbgp(el))) try: condorExe.exe_cmd("../sbin/condor_off","-master -pool %s %s"%(el['GLIDEIN_COLLECTOR_NAME'],el['GLIDEIN_MASTER_NAME'])) except RuntimeError, e: self.errors.append((time.time(),"condor_off failed: %s"%e)) except:
try: glideinFrontendInterface.deadvertizeAllWork( factory_pool_node, self.client_name) except RuntimeError, e: self.errors.append( (time.time(), "Deadvertizing failed: %s" % e)) except: tb = traceback.format_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2]) self.errors.append( (time.time(), "Deadvertizing failed: %s" % string.join(tb, ''))) # Stop all the glideins I can see ilog('Getting glidein pool status data.') try: pool_status = condorMonitor.CondorStatus() pool_status.load(self.glidekeeper_constraint, [('GLIDEIN_COLLECTOR_NAME', 's'), ('GLIDEIN_MASTER_NAME', 's'), ('MyAddress', 's')]) pool_data = pool_status.fetchStored() except: self.errors.append((time.time(), "condor_status failed")) for k in pool_data.keys(): el = pool_data[k] ilog('Now killing pool with data: (%s -> %s)' % (dbgp(k), dbgp(el))) try:
def request_glideins(self, needed_glideins):
    """Set the target number of glideins for the keeper thread to maintain.

    :param needed_glideins: desired glidein count (int)
    """
    # BUGFIX: log message said 'glidens' instead of 'glideins'
    ilog('Requesting %d glideins from thread.' % needed_glideins)
    # the keeper thread picks this value up on its next loop pass
    self.needed_glideins = needed_glideins
check1.load('(JobStatus<3)&&(GK_InstanceId=?="%s")&&(GK_SessionId=?="%s")'%(gktid.glidekeeper_id,gktid.session_id), [("JobStatus","s")]) data=check1.fetchStored() ilog('Success!') except RuntimeError,e: main_log.write("%s %s\n"%(ctime(), "condor_q failed (%s)... ignoring for now"%e)) main_log.flush() sleep(2) continue # retry the while loop except: main_log.write("%s %s\n"%(ctime(), "condor_q failed (reason unknown)... ignoring for now")) main_log.flush() sleep(2) continue # retry the while loop ilog('Found %s jobs running.'%len(data.keys())) main_log.write("%s %s %s\n"%(ctime(), len(data.keys()), 'jobs running')) main_log.flush() if len(data.keys()) == 0: running = "false" main_log.write("%s %s\n"%(ctime(), "no more running jobs")) else: sleep(10) def parse_result(config,workingDir,concurrencyLevel): # Create a loop to parse each log file into a summaries directory summDir = workingDir + '/summaries/' os.makedirs(summDir) for l in range(0, config.runs, 1): for k in range(0, len(concurrencyLevel), 1):
def __init__(self,argv):
    """Parse command-line arguments, locate and load the glideTester.cfg and
    parameters.cfg files, set the glideinWMS search path and start logging.

    :param argv: full process argv (argv[0] is the program name and is dropped)
    """
    # glideTester.cfg values
    self.runId=None
    self.glideinWMSDir = None
    self.configDir = None
    self.proxyFile = None
    self.pilotFile = None
    self.delegateProxy = None
    self.collectorNode = None
    self.gfactoryNode = None
    self.gfactoryConstraint = None
    self.gfactoryClassadID = None
    self.myClassadID = None
    self.mySecurityName = None

    # parameters.cfg values
    self.executable = None
    self.inputFile = None
    self.outputFile = None
    self.environment = None
    self.getenv = None
    self.arguments = None
    self.x509userproxy = None
    self.concurrencyLevel = None
    self.runs = None
    self.gfactoryAdditionalConstraint=None
    self.additionalClassAds = []
    self.reuseOldGlideins = None
    self.jobOutFormat=None
    self.prescript = None
    self.postscript = None

    # parse arguments
    valid_keys = ['-config', '-cfg', '--config', '-params', '-runId']
    arg_map = parse_argv(argv[1:], valid_kv_settings=valid_keys)
    passed_config_path = arg_map.get('-cfg') or arg_map.get('--config') or arg_map.get('-config')
    passed_params_path = arg_map.get('-params')
    self.cfg_paths = get_config_file_list(file_name='glideTester.cfg', arg_path=passed_config_path)
    self.params_path = get_config_file_list(file_name='parameters.cfg', arg_path=passed_params_path)
    self.runId = arg_map.get('-runId')

    # check and fix the attributes
    # IDIOM FIX: compare to None with "is"
    if self.runId is None:
        # not defined, create one specific for the account
        # should not be too random, or you pollute the factory namespace
        self.runId="u%i"%os.getuid()

    # load external values
    self.load_cfg()
    self.verify_cfg()
    # set search path so the glideinWMS libraries can be imported later
    if self.glideinWMSDir is not None:
        sys.path.insert(0, self.glideinWMSDir)
        sys.path.insert(0,os.path.join(self.glideinWMSDir,".."))
    self.load_config_dir()
    self.load_params()
    self.setup_logger()
    ilog("Made glideTester: \n\n%s\n"%dbgp(self, 4))