def removeRemoteFile(self, full_surl):
    """ Remove a remote file with gfal-rm (best effort: an exception is only logged) """

    # command layout: <setup> gfal-rm --verbose -t <timeout> <surl>
    command_fields = (self._setup, self.timeout, full_surl)
    cmd = '%s gfal-rm --verbose -t %d %s' % command_fields
    tolog("Executing command: %s" % (cmd))

    try:
        ec, rs = commands.getstatusoutput(cmd)
    except Exception as e:
        tolog("Warning: Exception caught in removeFile: %s" % (e))
def updateEventRanges(self, event_ranges):
    """
    Update a list of event ranges on the Event Server.

    event_ranges: list of dictionaries, e.g.
        [{'eventRangeID': '4001396-1800223966-4426028-1-2', 'eventStatus': 'running'}, ..]

    Returns the tuple (status, message). If pUtil.httpConnect() reports a non-zero
    code, status is that code and message describes the failure. Otherwise status
    is the integer 'StatusCode' of the response and message is the JSON dump of
    its 'Returns' field.
    """

    pUtil.tolog("Updating event ranges..")

    url = ""
    node = {}
    node['eventRanges'] = json.dumps(event_ranges)

    # open connection
    ret = pUtil.httpConnect(node, url, path=self.__updateEventRangesDir, mode="UPDATEEVENTRANGES")

    status = ret[0]
    if ret[0]:
        # non-zero return code
        # (the format string used to lack a conversion for ret[1], which raised
        # a TypeError instead of producing the message)
        message = "Failed to update event range - error code = %d, error: %s" % (ret[0], ret[1])
    else:
        response = json.loads(json.dumps(ret[1]))
        status = int(response['StatusCode'])
        message = json.dumps(response['Returns'])

    return status, message
def finish(self):
    """ Mark the Event Stager as allowed to finish and renew its status (failures are only logged) """

    try:
        pUtil.tolog("Tell Event Stager to finish after finishing staging out all events")
        self.__canFinish = True
        self.renewEventStagerStatus()
    except:
        # best effort: report the traceback and carry on
        trace = traceback.format_exc()
        pUtil.tolog("Failed to monitor Event Stager: %s" % trace)
def core_get_data(self, envsetup, token, source_surl, dest_path, experiment): """ stage-in core function, can be overridden (see stormSiteMover) """ error = PilotErrors() # determine which timeout option to use timeout_option = "--connect-timeout 300 --max-time %d" % (self.timeout) sslCert = self.sslCert sslKey = self.sslKey sslCertDir = self.sslCertDir # used curl options: # --cert: <cert[:passwd]> Client certificate file and password (SSL) # --capath: <directory> CA directory (made using c_rehash) to verify # --location: Follow Location: hints (H) # --output: <file> Write output to <file> instead of stdout # --cilent: Makes Curl mute # --show-error: When used with -s it makes curl show error message if it fails # Removed for SL6: --ciphers <list of ciphers> (SSL) Specifies which ciphers to use in the connection. """ define curl command string """ _cmd_str = 'lcg-gt %s https' % (source_surl) try: s, o = commands.getstatusoutput(_cmd_str) tolog("Executing command: %s" % (_cmd_str)) except Exception, e: tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" % (str(e), s, o)) o = str(e)
def updateEventRange(self, event_range_id, status='finished'):
    """ Report the status of a single event range to the Event Server """

    pUtil.tolog("Updating an event range..")

    url = ""
    node = {
        'eventRangeID': event_range_id,
        'eventStatus': status,
    }

    # open connection
    ret = pUtil.httpConnect(node, url, path=self.__updateEventRangesDir, mode="UPDATEEVENTRANGE")
    exit_code = ret[0]

    # a message is only produced for a non-zero return code
    message = ""
    if exit_code:
        message = "Failed to update event range - error code = %d" % (exit_code)

    return exit_code, message
def findVmPeaks(setup): """ Find the VmPeak values """ vmPeakMax = 0 vmPeakMean = 0 RSSMean = 0 # matched_lines = grep(["Py\:PerfMonSvc\s*INFO\s*VmPeak:\s*[0-9]"], stdout_filename) # pattern = "([0-9]+)" # # now extract the digits from the found lines # N = 0 # vmPeaks = 0 # for line in matched_lines: # _vmPeak =, line) # if _vmPeak: # N += 1 # vmPeak = # if vmPeak > vmPeakMax: # vmPeakMax = vmPeak # vmPeaks += vmPeak # use the VmPeak script to get all values cmd = "%s python >Pilot_VmPeak.txt" % (setup) try: ec, output = timedCommand(cmd, timeout=getProperTimeout(setup)) except Exception, e: tolog("!!WARNING!!1111!! Failed to execute VmPeak script: %s" % (e))
def updateOutputFilesXMLWithSURLs4NG(self, experiment, siteWorkdir, jobId, outputFilesXML): """ Update the OutputFiles.xml file with SURLs """ status = False # open and read back the OutputFiles.xml file _filename = os.path.join(siteWorkdir, outputFilesXML) if os.path.exists(_filename): try: f = open(_filename, "r") except Exception, e: tolog("!!WARNING!!1990!! Could not open file %s: %s" % (_filename, e)) else: # get the metadata xmlIN = f.close() # update the XML xmlOUT = updateXMLWithSURLs(experiment, xmlIN, siteWorkdir, jobId, self.__jobrec, format='NG') # write the XML try: f = open(_filename, "w") except OSError, e: tolog("!!WARNING!!1990!! Could not open file %s: %s" % (_filename, e)) else:
def getProcessCommands(euid, pids):
    """
    Return a list of process commands corresponding to a pid list for user euid.

    euid: numerical user id passed to 'ps u -u <euid>'.
    pids: iterable of process ids; each is compared as a string against the
          second column of the ps output.

    Returns a list that starts with the ps header line, followed by the
    unmodified ps lines whose second column matches one of pids. The list is
    empty if the ps command fails.
    """

    _cmd = 'ps u -u %d' % (euid)
    processCommands = []

    ec, rs = commands.getstatusoutput(_cmd)
    if ec != 0:
        pUtil.tolog("Command failed: %s" % (rs))
        return processCommands

    # convert once instead of once per ps line
    wanted = set([str(pid) for pid in pids])

    pCommands = rs.split('\n')

    # get the header info line
    processCommands.append(pCommands[0])

    # extract the relevant processes
    for pCmd in pCommands[1:]:
        # split() without a separator collapses runs of whitespace; the former
        # replace-based loop could not terminate on a line containing a space
        items = pCmd.split()

        # items = username pid ... (blank or short lines carry no pid column)
        if len(items) > 1 and items[1] in wanted:
            processCommands.append(pCmd)

    return processCommands
def purgeFiles(path, filename, limit=12*3600):
    """
    Locate and remove lingering files.

    path: directory to scan.
    filename: glob pattern, relative to path, selecting the candidates.
    limit: age in seconds (from the modification time) that a match has to
           exceed before it is removed.

    At most 50 matches are examined per call. Nothing is returned; removal
    failures are only logged.
    """

    max_files = 50
    all_files = glob("%s/%s" % (path, filename))

    for _file in all_files[:max_files]:
        # when was the entry last modified?
        current_time = int(time.time())
        try:
            file_modification_time = os.path.getmtime(_file)
        except OSError:
            # skip this entry since it was not possible to read the modification time
            continue

        mod_time = current_time - file_modification_time
        if mod_time <= limit:
            continue

        tolog("Found file %s last modified %d s ago (will now try to purge it)" % (_file, mod_time))

        # remove the file directly: the name comes from a glob over a shared
        # directory and must not be interpolated into a shell command line
        try:
            os.remove(_file)
        except OSError as e:
            # like 'rm -f', stay silent if the file is already gone
            if os.path.lexists(_file):
                tolog("Failed to remove dir: %s" % (e))
def remove(self, site, job):
    """
    Remove the job state file.

    Should only be called for finished jobs after the last server update.

    site: object whose workdir attribute is passed to self.getFilename().
    job: object whose jobId attribute is passed to self.getFilename().

    Returns True if the job state file existed and was removed, False otherwise
    (the reason is logged).
    """

    # get the appropriate filename
    fileName = self.getFilename(site.workdir, job.jobId)

    if not os.path.isfile(fileName):
        tolog("JOBSTATE FAILURE: Job state file does not exist: %s" % fileName)
        return False

    # remove the job state file
    # (os.remove() raises OSError on failure, whereas os.system("rm -f ..") only
    # returns an exit status, so the failure branch could never be reached)
    try:
        os.remove(fileName)
    except OSError:
        tolog("JOBSTATE FAILURE: Failed to remove job state file: %s" % fileName)
        return False

    return True
def addFullPathsAsInput(jobPars, full_paths_dictionary):
    """ Swap the LFNs given to --inputEVNTFile for their full paths """

    # Example:
    #   jobPars = .. --inputEVNTFile=EVNT.01416937._000003.pool.root,EVNT.01416937._000004.pool.root ..
    # becomes
    #   jobPars = .. --inputEVNTFile=root://../EVNT.01416937._000003.pool.root,root://../EVNT.01416937._000004.pool.root
    # The full path of an LFN is read from full_paths_dictionary[lfn]['pfn']

    # nothing to do unless the option is present
    if "--inputEVNTFile" not in jobPars:
        return jobPars

    pattern = r"\'?\-\-inputEVNTFile\=(.+)\'?"
    for token in re.findall(r'\S+', jobPars):
        matches = re.findall(pattern, token)
        if not matches:
            continue

        # the greedy group may have swallowed a closing quote
        input_files = matches[0]
        if input_files.endswith("\'"):
            input_files = input_files[:-1]

        if not input_files:
            tolog(
                "!!WARNING!!3434!! Zero length list, cannot update LFN:s with full paths (remote I/O will not work)")
            continue

        for lfn in input_files.split(','):
            if lfn not in full_paths_dictionary:
                tolog("!!WARNING!!3435!! Did not find LFN=%s" % lfn)
                continue

            full_path = full_paths_dictionary[lfn]['pfn']
            # do not replace again if the full path is already in place
            if full_path not in jobPars:
                jobPars = jobPars.replace(lfn, full_path)

    return jobPars
def __init__(self, setup_path, *args, **kwrds): self._setup = setup_path self.copyCommand = 'aria2c' self.commandInPATH() rucio_account=self.rucio_account tolog("Rucio account: %s" %(rucio_account)) if rucio_account == "": tolog("!!FAILED!!2999!! Rucio account not set!") raise Exception("!!FAILED!!2999!! Rucio account not set!") cmd="curl -1 -i -H \"X-Rucio-Account: $RUCIO_ACCOUNT\" --cacert %s --cert %s --key %s --capath %s -X GET| grep 'X-Rucio-Auth-Token:'"%(self.sslKey,self.sslKey,self.sslKey,self.sslCertDir) tolog("Command to be launched: %s" %(cmd)) token_rucio_cmd=Popen(cmd,stdout=PIPE,stderr=PIPE, shell=True) token_rucio, stderr= token_rucio_cmd.communicate() if token_rucio: if '\r' in token_rucio: pos2print=token_rucio.find('\r') token_rucio=token_rucio[:pos2print] elif '\n' in token_rucio: pos2print=token_rucio.find('\n') pos2print=token_rucio.find("CN") token_rucio2print=token_rucio[:pos2print]+'(Hidden token)' tolog("Token on file: %s" %(token_rucio2print)) if os.path.exists('token_file'): os.remove('token_file') try: token_file=open('token_file', 'w') except IOError, e: tolog ("!!WARNING!! Failed to create file: %s"%(e)) raise Exception("!!FAILED!!1099!! Cannot create file for registering token!") else: token_file.write(token_rucio)
def addMD5sum(self, lfn, md5sum):
    """ Set the md5sum of lfn in the LFC and return the exit code of the last LFC call """

    # the lfc module is imported below, so report the library path first
    if 'LD_LIBRARY_PATH' in os.environ:
        tolog("LD_LIBRARY_PATH prior to lfc import: %s" % os.environ['LD_LIBRARY_PATH'])
    else:
        tolog("!!WARNING!!2999!! LD_LIBRARY_PATH not set prior to lfc import")

    import lfc
    os.environ['LFC_HOST'] = readpar('lfchost')

    # look up the current catalog entry (guid and file size are needed below)
    stat = lfc.lfc_filestatg()
    exitcode = lfc.lfc_statg(lfn, "", stat)
    if exitcode != 0:
        tolog("!!WARNING!!2999!! lfc.lfc_statg: %d %s" % (lfc.cvar.serrno, lfn))
        return exitcode

    # store the checksum with checksum type 'MD'
    exitcode = lfc.lfc_setfsizeg(stat.guid, stat.filesize, 'MD', md5sum)
    if exitcode == 0:
        tolog("Successfully set md5sum for %s" % (lfn))
    else:
        tolog("[Non-fatal] ERROR: lfc.lfc_setfsizeg: %d %s %s" % (lfc.cvar.serrno, lfn, md5sum))

    return exitcode
def downloadAllQueuenames(self):
    """ Fetch the entire schedconfig dump from AGIS unless it is already on disk """

    # JSON support is a prerequisite, do not download anything without it
    try:
        from json import load
    except:
        tolog("!!WARNING!!1231!! JSON is not available, cannot download schedconfig dump")
        return -1

    url = ""
    schedconfig_dump = self.getAllQueuedataFilename()
    cmd = "curl \'%s\' >%s" % (url, schedconfig_dump)

    # an existing dump is reused
    if os.path.exists(schedconfig_dump):
        tolog("File %s already downloaded" % (schedconfig_dump))
        return 0

    tolog("Executing command: %s" % (cmd))
    ec, out = commands.getstatusoutput(cmd)
    if ec == 0:
        tolog("Downloaded schedconfig dump")
    else:
        tolog("!!WARNING!!1234!! Failed to download %s: %d, %s" % (schedconfig_dump, ec, out))

    return ec
def getSpecialAppdir(self, value):
    """ Resolve the appdir from env variable 'value' and store it in queuedata; returns (ec, appdir) """

    ec = 0
    _appdir = ""

    # is the variable defined at all?
    if value not in os.environ:
        tolog("!!WARNING!!2220!! Environmental variable %s is not defined" % (value))
        return ec, _appdir

    # expand the value in case it contains further environmental variables
    _appdir = os.path.expandvars(os.environ[value])
    tolog("Environment has variable $%s = %s" % (value, _appdir))

    if _appdir == "":
        tolog("!!WARNING!!2999!! Environmental variable not set: %s" % (value))
        ec = self.__error.ERR_SETUPFAILURE
    elif self.replaceQueuedataField('appdir', _appdir, verbose=False):
        # the evaluated symbol is now stored in appdir
        tolog("Updated field %s in queuedata: %s" % ('appdir', _appdir))
    else:
        tolog("!!WARNING!!2222!! Queuedata field could not be updated, cannot continue")
        ec = self.__error.ERR_SETUPFAILURE

    return ec, _appdir
def stageInFile(self, source, destination, sourceSize, sourceChecksum, guid=None): """StageIn the file. should be implementated by different site mover.""" statusRet = 0 outputRet = {} outputRet["errorLog"] = None outputRet["report"] = {} outputRet["report"]["clientState"] = None # build the parameters _params = "" if sourceSize != 0 and sourceSize != "0": _params += self.__par_filesize % (sourceSize) if sourceChecksum and sourceChecksum != 'None' and sourceChecksum != 0 and sourceChecksum != "0" and not self.isDummyChecksum(sourceChecksum): csumtype = self.getChecksumType(sourceChecksum) # special case for md5sum (command only understands 'md5' and 'adler32', and not 'ad' and 'md5sum') if csumtype == 'md5sum': csumtype = 'md5' _params += self.__par_checksum % ("%s:%s" % (csumtype, sourceChecksum),) # add the guid option _params += " --guid %s" % (guid) self.log("StageIn files started.") _cmd_str = self.__localget % (self._setup, _params, source, destination) self.log('Executing command: %s' % (_cmd_str)) s = -1 o = '(not defined)' t0 = os.times() outputRet["report"]['relativeStart'] = time() outputRet["report"]['transferStart'] = time() try: timerCommand = TimerCommand(_cmd_str) s, o = except Exception, e: tolog("!!WARNING!!2990!! Exception caught by stageInFile(): %s" % (str(e))) o = str(e)
def getTier1Queue2(self, cloud): """ Download the queuedata for the Tier-1 in the corresponding cloud and get the queue name """ queuename = "" path = self.getTier1InfoFilename() ec = self.downloadTier1Info() if ec == 0: # Process the downloaded T-1 info f = open(path, 'r') if getExtension() == "json": from json import loads data = loads( else: from pickle import load data = load(f) f.close() # Extract the relevant queue info for the given cloud T1_info = [x for x in data if x['cloud']==cloud] # finally get the queue name if T1_info != []: info = T1_info[0] if info.has_key('PanDAQueue'): queuename = info['PanDAQueue'] else: tolog("!!WARNING!!1222!! Returned Tier-1 info object does not have key PanDAQueue: %s" % str(info)) else: tolog("!!WARNING!!1223!! Found no Tier-1 info for cloud %s" % (cloud)) return queuename
def sig2exc(sig, frm):
    """ Signal handler: store the matching error code in the globals, then raise SystemError """

    global failureCode, globalPilotErrorDiag, globalErrorCode

    error = PilotErrors()

    globalPilotErrorDiag = "!!FAILED!!3000!! SIGTERM Signal %s is caught in child pid=%d!\n" % (sig, os.getpid())
    tolog(globalPilotErrorDiag)

    # (signal name, error code attribute) pairs, resolved lazily and in this order
    known_signals = (
        ("SIGTERM", "ERR_SIGTERM"),
        ("SIGQUIT", "ERR_SIGQUIT"),
        ("SIGSEGV", "ERR_SIGSEGV"),
        ("SIGXCPU", "ERR_SIGXCPU"),
        ("SIGBUS", "ERR_SIGBUS"),
        ("SIGUSR1", "ERR_SIGUSR1"),
    )
    for signal_name, error_name in known_signals:
        if sig == getattr(signal, signal_name):
            globalErrorCode = getattr(error, error_name)
            break
    else:
        # any other signal
        globalErrorCode = error.ERR_KILLSIGNAL
    failureCode = globalErrorCode

    # print to stderr
    sys.stderr.write(globalPilotErrorDiag + "\n")

    raise SystemError(sig)
def fixStageInPath(self, path):
    """
    Fix the stage-in path.

    Paths that do not start with 'srm', and srm paths that contain '?SFN=', are
    returned unchanged. For other srm paths the 'srm://<hostname>' prefix is
    replaced by the match from self.getSEMatchFromSEOpt(), or, if there is no
    match, by the first entry of the 'se' queuedata field (followed by
    self.addPortToPath()).

    Returns the (possibly updated) path.
    """

    if path[:3] != "srm":
        return path

    if '?SFN=' in path:
        self.log("Found SFN part in file path: %s" % (path))
        return path

    try:
        hostname = path.split('/', 3)[2]
    except Exception as e:
        # the format arguments used to be part of the string literal, so the
        # message was logged without the path and the error
        self.log("!!WARNING!!2999!! Could not extract srm protocol for replacement, keeping path variable as it is: %s (%s)" %
                 (path, str(e)))
        return path

    srm = 'srm://' + hostname

    # does seopt contain any matching srm's?
    sematch = self.getSEMatchFromSEOpt(srm)
    if sematch != "":
        path = path.replace(srm, sematch)
        self.log("Replaced %s with %s (from seopt) in path: %s" % (srm, sematch, path))
    else:
        se = readpar('se').split(",")[0]
        _dummytoken, se = self.extractSE(se)
        tolog("Using SE: %s" % (se))

        path = path.replace(srm, se)
        self.log("Replaced %s with %s (from se) in path: %s" % (srm, se, path))

        # add port number from se to getfile if necessary
        path = self.addPortToPath(se, path)

    return path
def getOutFilesGuids(outFiles, workdir):
    """ Read the guids of the output files from the PoolFileCatalog.xml in workdir """

    pfcFile = "%s/PoolFileCatalog.xml" % (workdir)

    # one slot per output file; a file that is not listed in the PFC keeps None
    outFilesGuids = [None] * len(outFiles)

    # without a PFC there is nothing to look up
    if not os.path.isfile(pfcFile):
        pilotErrorDiag = "PFC file does not exist: %s" % (pfcFile)
        tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
        return PilotErrors().ERR_MISSINGPFC, pilotErrorDiag, outFilesGuids

    from xml.dom import minidom
    for entry in minidom.parse(pfcFile).getElementsByTagName("File"):
        gpfn = str(entry.getElementsByTagName("pfn")[0].getAttribute("name"))
        guid = str(entry.getAttribute("ID"))

        # store the guid at every position holding this file name
        for index, name in enumerate(outFiles):
            if name == gpfn:
                outFilesGuids[index] = guid

    return 0, "", outFilesGuids
def stageIn(job, jobSite, analJob, pilot_initdir, pworkdir):
    """ Stage in the input files of the job; returns (job, ins, statusPFCTurl, usedFAXandDirectIO) """

    ec = 0
    statusPFCTurl = None
    usedFAXandDirectIO = False

    # prepare the input files (remove non-valid names) if there are any
    ins, job.filesizeIn, job.checksumIn = RunJobUtilities.prepareInFiles(job.inFiles, job.filesizeIn, job.checksumIn)

    # no input files, no transfer
    if not ins:
        return job, ins, statusPFCTurl, usedFAXandDirectIO

    tolog("Preparing for get command")

    # get the file access info (only useCT is needed here)
    useCT, oldPrefix, newPrefix, useFileStager, directIn = getFileAccessInfo()

    # transfer input files
    tin_0 = os.times()
    transfer_result = mover.get_data(job, jobSite, ins, stageinretry, analysisJob=analJob, usect=useCT,
                                     pinitdir=pilot_initdir, proxycheck=False, inputDir=inputDir, workDir=pworkdir)
    ec, job.pilotErrorDiag, statusPFCTurl, job.filesWithoutFAX, job.filesWithFAX, usedFAXandDirectIO = transfer_result
    if ec != 0:
        job.result[2] = ec
    tin_1 = os.times()

    # element 4 of os.times() is the elapsed real time
    job.timeStageIn = int(round(tin_1[4] - tin_0[4]))

    return job, ins, statusPFCTurl, usedFAXandDirectIO
def getValidBaseURLs(self, order=None):
    """ Return the list of valid base URLs, with 'order' (if given) as the first item """

    _validBaseURLs = ["", "", "", "", "", "", "", "", ""]

    if order:
        # requested item first, followed by all other known URLs
        validBaseURLs = [order] + [url for url in _validBaseURLs if url != order]
    else:
        validBaseURLs = _validBaseURLs

    tolog("getValidBaseURLs will return: %s" % str(validBaseURLs))
    return validBaseURLs
def removeRedundantFiles(self, workdir):
    """ Delete files and directories from workdir that are not wanted in the log file """

    # Glob patterns (relative to the work directory) removed prior to log file creation.
    # Any large file or directory that should stay out of the log file belongs in this list.
    patterns = ("buildJob*",
                "external",
                "fort.*",
                "home",
                "python",
                "share",
                "workdir",
                "*.py",
                "*.pyc",
                "*.root*",
                "JEM",
                "tmp*",
                "*.tmp",
                "*.TMP",
                "scratch")

    for pattern in patterns:
        files = glob(os.path.join(workdir, pattern))
        if not remove(files):
            tolog("IGNORE: Failed to remove redundant file(s): %s" % (files))
def getRemoteFileInfo(self, file):
    """ Ask the S3 objectstore for (size, md5) of file; (None, None) is returned on any failure """

    try:
        info = self.s3Objectstore.getRemoteFileInfo(file)
        size, md5 = info
    except:
        tolog("Failed to get remote file information: %s" % (sys.exc_info()[1]))
        return None, None
    else:
        return size, md5
def getSecurityKey(self, privateKeyName, publicKeyName):
    """
    Return the key pair.

    The pair is requested once from the server (POST to /server/panda/getKeyPair)
    and cached in self.__securityKeys under '<privateKeyName>_<publicKeyName>'.

    Returns a dictionary with the keys 'publicKey' and 'privateKey'. Both values
    are None if the request failed or the server did not answer with
    StatusCode 0.
    """

    keyName = privateKeyName + "_" + publicKeyName
    if keyName in self.__securityKeys:
        return self.__securityKeys[keyName]

    try:
        sslCert = self.getSSLCertificate()
        sslKey = sslCert

        node = {}
        node['privateKeyName'] = privateKeyName
        node['publicKeyName'] = publicKeyName

        # The key pair is not set on other panda server
        host = ''
        path = '/server/panda/getKeyPair'

        conn = httplib.HTTPSConnection(host, key_file=sslKey, cert_file=sslCert)
        try:
            conn.request('POST', path, urllib.urlencode(node))
            resp = conn.getresponse()
            # read the body before closing; conn.close() returns None, so its
            # result can not be used as the response data
            data = resp.read()
        finally:
            conn.close()

        dic = cgi.parse_qs(data)
        if dic["StatusCode"][0] == "0":
            self.__securityKeys[keyName] = {"publicKey": dic["publicKey"][0], "privateKey": dic["privateKey"][0]}
            return self.__securityKeys[keyName]
    except Exception:
        _type, value, traceBack = sys.exc_info()
        tolog("Failed to getKeyPair for (%s, %s)" % (privateKeyName, publicKeyName))
        tolog("ERROR: %s %s" % (_type, value))

    return {"publicKey": None, "privateKey": None}
def dumpValue(self, name, value):
    """ Log 'name = value', or a note that the value was not set when it is an empty string """

    if value == "":
        tolog("%s was not set by the batch system" % (name))
    else:
        tolog("%s = %s" % (name, value))
def addToJobMetrics(self, jobResult, path, jobId):
    """ Return the job metrics string with the batch job and machine features """

    jobMetrics = ""

    # The machine/job feature fields are currently switched off:
    # jobMetrics += self.addFieldToJobMetrics("hs06", self.hs06)
    # jobMetrics += self.addFieldToJobMetrics("shutdowntime", self.shutdownTime)
    # jobMetrics += self.addFieldToJobMetrics("jobslots", self.jobSlots)
    # jobMetrics += self.addFieldToJobMetrics("phys_cores", self.physCores)
    # jobMetrics += self.addFieldToJobMetrics("log_cores", self.logCores)
    # jobMetrics += self.addFieldToJobMetrics("cpufactor_lrms", self.cpuFactorLrms)
    # jobMetrics += self.addFieldToJobMetrics("cpu_limit_secs_lrms", self.cpuLimitSecsLrms)
    # jobMetrics += self.addFieldToJobMetrics("cpu_limit_secs", self.cpuLimitSecs)
    # jobMetrics += self.addFieldToJobMetrics("wall_limit_secs_lrms", self.wallLimitSecsLrms)
    # jobMetrics += self.addFieldToJobMetrics("wall_limit_secs", self.wallLimitSecs)
    # jobMetrics += self.addFieldToJobMetrics("disk_limit_GB", self.diskLimitGB)
    # jobMetrics += self.addFieldToJobMetrics("jobstart_secs", self.jobStartSecs)
    # jobMetrics += self.addFieldToJobMetrics("mem_limit_MB", self.memLimitMB)
    # jobMetrics += self.addFieldToJobMetrics("allocated_CPU", self.allocatedCPU)

    # The max disk space used by the payload is only looked up at the end of a job
    if jobResult not in ("finished", "failed", "holding"):
        return jobMetrics

    max_space = getMaxWorkDirSize(path, jobId)
    if max_space > 0:
        jobMetrics += self.addFieldToJobMetrics("workDirSize", max_space)
    else:
        tolog("Will not add max space = %d to job metrics" % (max_space))

    return jobMetrics
def checkSpecialEnvVars(self, sitename):
    """ Placeholder check of special environment variables: logs a note and returns 0 """

    tolog("No special env var checks for site %s" % (sitename))
    return 0
def deserialize(self, text_string):
    """
    Populate this object from a serialized job description.

    text_string is handed to deserialize() and the result replaces
    self.__dict__. All string fields, and the entries of the two file lists,
    are then passed through convert_unicode_string().

    An exception raised during the conversion of text_string is logged and
    re-raised.
    """

    # format the message before handing it to tolog(); the payload used to be
    # passed as a second positional argument instead of being interpolated
    tolog("Job received: %s" % (text_string))

    try:
        self.__dict__ = deserialize(text_string)
    except:
        tolog(" received exception while converting json string to BalsamJob: " + str(sys.exc_info()[1]))
        raise

    # plain string fields
    string_fields = (
        'executable',
        'executable_args',
        'input_url',
        'output_url',
        'preprocess',
        'preprocess_args',
        'postprocess',
        'postprocess_args',
        'scheduler_args',
        'condor_job_file',
        'condor_dagman_file',
        'target_site',
    )
    for field in string_fields:
        setattr(self, field, convert_unicode_string(getattr(self, field)))

    # list fields: convert entry by entry
    for field in ('input_files', 'output_files'):
        converted = [convert_unicode_string(entry) for entry in getattr(self, field)]
        setattr(self, field, converted)
def getSubprocessName(self, eventService):
    """ Pick the name of the subprocess that the Monitor should run """

    # "RunJob" is the default subprocess (payload setup, stage-in, payload execution and stage-out).
    # Event service jobs use a runEvent type module instead, which downloads events from an
    # Event Server, executes a payload and stages out output files asynchronously as they are ready.

    isHPC, _name = extractHPCInfo(readpar('catchall'))

    # HPC plug-ins get their own name, e.g. "RunJobTitan" for the Titan plug-in;
    # "Hpc" (es merge jobs) falls back to the default
    if isHPC and _name != "Hpc":
        name = "RunJob" + _name
    else:
        name = "RunJob"

    # Event service jobs override the selection above
    if eventService:
        tolog("Encountered an event service job")
        if isHPC:
            name = "RunJob%sEvent" % (_name)
        else:
            name = "RunJobEvent"

    tolog("Selected subprocess: %s" % (name))

    return name
if len(ls) == 1: if "workDir" in ls: ec, rs = commands.getstatusoutput("ls -lF %s" % (_dir)) tolog("ls: %s" % str(rs)) tolog( "Found single workDir: %s (will now purge it)" % (_dir)) ec, rs = commands.getstatusoutput("rm -rf %s" % (_dir)) if ec != 0: tolog("Failed to remove dir: %s" % (rs)) else: purged_nr += 1 dir_nr += 1 tolog("Purged %d single workDirs directories" % (purged_nr)) purgeWorkDirs = staticmethod(purgeWorkDirs) def purgeFiles(path, filename, limit=12 * 3600): """ locate and remove lingering directories/files """ all_files = glob("%s/%s" % (path, filename)) max_files = 50 file_nr = 0 for _file in all_files: if file_nr >= max_files: break # when was the dir last modified?
def getFileTransferInfo(self, transferType, buildJob):
    """
    Get all relevant fields related to file transfer.

    The direct access dictionary is built by getDirectAccessDic() from the
    'copysetupin' queuedata field; if that yields nothing, 'copysetup' is
    used instead.

    transferType: the value 'direct' switches off the copy tool and the file
                  stager and switches on direct access.
    buildJob: when true, usePFCTurl is always returned as False.

    Returns the tuple (dInfo, useCopyTool, useDirectAccess, useFileStager,
    oldPrefix, newPrefix, copysetup, usePFCTurl, lfcHost). dInfo is True if a
    direct access dictionary was found, otherwise None.
    """

    copysetup = readpar('copysetupin')

    # create the direct access dictionary
    fileTransferInfo = getDirectAccessDic(copysetup)

    # if copysetupin did not contain direct access info, try the copysetup instead
    if not fileTransferInfo:
        copysetup = readpar('copysetup')
        fileTransferInfo = getDirectAccessDic(copysetup)

    # should the copytool be used?
    useCopyTool = False
    useFileStager = False
    useDirectAccess = False
    lfcHost = readpar('lfchost')
    oldPrefix = ""
    newPrefix = ""
    dInfo = None
    if fileTransferInfo:
        dInfo = True
        # no direct access / remote I/O, use standard copytool (copy-to-scratch)
        if fileTransferInfo['useCopyTool']:
            useCopyTool = True
        # do not set the LFC host for file stager
        if fileTransferInfo['useFileStager']:
            useFileStager = True
        if fileTransferInfo['directIn']:
            useDirectAccess = True

        oldPrefix = fileTransferInfo['oldPrefix']
        newPrefix = fileTransferInfo['newPrefix']

    # override settings for transferType direct
    if transferType == 'direct':
        useCopyTool = False
        useFileStager = False
        useDirectAccess = True
        # without prefixes the LFC host is cleared
        if oldPrefix == "" and newPrefix == "":
            lfcHost = ""

    # should pilot create TURL based PFC? (not done here, but setup needs to be aware of it)
    # if dInfo and useDirectAccess and oldPrefix == "" and newPrefix == "":
    if (transferType == 'direct' or (useFileStager and useDirectAccess)) and (
            oldPrefix == "" and newPrefix == "") and not buildJob:
        # if (transferType == 'direct' or (not useFileStager and useDirectAccess)) and (oldPrefix == "" and newPrefix == ""):
        usePFCTurl = True
    else:
        usePFCTurl = False

    # force usePFCTurl for all jobs
    # (any non-build job with direct access gets usePFCTurl and loses its prefixes)
    if not buildJob and useDirectAccess:
        tolog("Forced usePFCTurl (reset old/newPrefix)")
        usePFCTurl = True
        oldPrefix = ""
        newPrefix = ""

    return dInfo, useCopyTool, useDirectAccess, useFileStager, oldPrefix, newPrefix, copysetup, usePFCTurl, lfcHost
class Experiment(object):
    """
    Generic experiment class.

    The methods defined here return neutral defaults ("generic" experiment, no
    file lookups, no special checks) and are meant to be overridden where an
    experiment needs different behaviour.
    """

    # experiment = "generic"               # String defining the experiment

    # private data members
    __experiment = "generic"               # String defining the experiment
    __instance = None                      # Boolean used by subclasses to become a Singleton
    __error = PilotErrors()                # PilotErrors object
    __doFileLookups = False                # True for LFC based file lookups (basically a dummy data member here since singleton object is static)
    __cache = ""                           # Cache URL used e.g. by LSST

    # Required methods

    def __init__(self, *args, **kwargs):
        """ Default initialization """

        # e.g. self.__errorLabel = errorLabel
        # self.experiment = kwargs.get('experiment')
        pass

    def getExperiment(self):
        """ Return a string with the experiment name """

        # return self.experiment
        return self.__experiment

    def getJobExecutionCommand(self):
        """ Define and test the command(s) that will be used to execute the payload """
        # E.g. cmd = "source <path>/; <path>/python <script>"

        cmd = ""

        return cmd

    def getFileLookups(self):
        """ Return the file lookup boolean """

        return self.__doFileLookups

    def doFileLookups(self, doFileLookups):
        """ Update the file lookups boolean """

        # Only implement this method if class really wants to update the __doFileLookups boolean
        # ATLAS wants to implement this, but not CMS
        # Method is used by Mover
        # self.__doFileLookups = doFileLookups
        pass

    def willDoAlternativeFileLookups(self):
        """ Should file lookups be done using alternative methods? """

        # E.g. in the migration period where LFC lookups are halted in favour of other methods in the Rucio API
        # (for ATLAS), this method could be useful. See the usage in Mover::getReplicaDictionary() which is called
        # after Experiment::willDoFileLookups() defined above. The motivation is that direct LFC calls are not to be
        # used any longer by the pilot, and in the migration period the actual LFC calls will be done in the Rucio
        # API. Eventually this API will switch to alternative file lookups.

        return False

    def willDoFileLookups(self):
        """ Should (LFC) file lookups be done by the pilot or not? """

        return self.__doFileLookups

    def willDoFileRegistration(self):
        """ Should (LFC) file registration be done by the pilot or not? """

        return False

    def getFileCatalog(self):
        """ Return the default file catalog to use (e.g. for replica lookups) """
        # See usage in
        # e.g. 'lfc://'

        return ""

    # Additional optional methods
    # These methods are optional and can be left as they are here, or modified according to special needs

    def verifyProxy(self, envsetup="", limit=None):
        """ Check for a valid voms/grid proxy longer than N hours """
        # Use 'limit' to set required length

        tolog("(verifyProxy() is not implemented)")

        exitcode = 0
        pilotErrorDiag = ""

        return exitcode, pilotErrorDiag

    def removeRedundantFiles(self, workdir):
        """ Remove redundant files and directories """

        # List of files and directories to be removed from work directory prior to log file creation
        # Make sure that any large files or directories that are not wanted in the log file are included in this list
        dir_list = ["buildJob*",
                    "external",
                    "fort.*",
                    "home",
                    "python",
                    "share",
                    "workdir",
                    "*.py",
                    "*.pyc",
                    "*.root*",
                    "JEM",
                    "tmp*",
                    "*.tmp",
                    "*.TMP",
                    "scratch",
                    ]

        for _dir in dir_list:
            files = glob(os.path.join(workdir, _dir))
            rc = remove(files)
            if not rc:
                tolog("IGNORE: Failed to remove redundant file(s): %s" % (files))

    def getPayloadName(self, job):
        """ Set a suitable name for the payload stdout """

        # The payload <name> gets translated into <name>_stdout.txt
        # which is the name of the stdout file produced by the payload execution
        # (essentially commands.getoutput("<setup>; <payload executable> [options] > <name>_stdout.txt"))

        # The job object can be used to create more precise stdout names (see e.g. the ATLASExperiment implementation)

        return "payload"

    def isOutOfMemory(self, **kwargs):
        """ Try to identify out of memory errors in the stderr/out """

        return False

    def getNumberOfEvents(self, **kwargs):
        """ Return the number of events """

        return 0

    def specialChecks(self, **kwargs):
        """ Implement special checks here """

        # Return False if fatal failure, otherwise return True
        # The pilot will abort if this method returns a False

        # On an HPC system, it might be good to skip certain checks (e.g. CVMFS, LFC, etc). Refer to schedconfig.resourcetype, set to 'hpc' on an HPC queue

        status = False

        # use the accessor: there is no 'experiment' attribute (its assignment
        # in __init__ is commented out), so self.experiment raised AttributeError
        tolog("No special checks for \'%s\'" % (self.getExperiment()))

        return True # obviously change this to 'status' once implemented

    def checkSpecialEnvVars(self, sitename):
        """ Check special environment variables """

        ec = 0
        tolog("No special env var checks for site %s" % (sitename))

        return ec

    def setINDS(self, realDatasetsIn):
        """ Extract the dataset as set by pathena option --inDS and set the INDS environmental variable """
        # Needed by pathena (move to ATLASExperiment later)

        # the first dataset that is neither a DBRelease nor a lib dataset is used
        inDS = ""
        for ds in realDatasetsIn:
            if "DBRelease" not in ds and ".lib." not in ds:
                inDS = ds
                break
        if inDS != "":
            tolog("Setting INDS env variable to: %s" % (inDS))
            os.environ['INDS'] = inDS
        else:
            tolog("INDS unknown")

    def getValidBaseURLs(self, order=None):
        """ Return list of valid base URLs """

        # if order is defined, return given item first

        validBaseURLs = []
        _validBaseURLs = ["",
                          "",
                          "",
                          "",
                          "",
                          "",
                          "",
                          "",
                          ""]

        if order:
            validBaseURLs.append(order)
            for url in _validBaseURLs:
                if url != order:
                    validBaseURLs.append(url)
        else:
            validBaseURLs = _validBaseURLs

        tolog("getValidBaseURLs will return: %s" % str(validBaseURLs))
        return validBaseURLs

    def downloadTrf(self, wgetCommand, jobTrf):
        """
        Download the trf.

        The command '<wgetCommand> <jobTrf>' is tried up to three times with a
        60 s pause between failed trials. Returns (status, pilotErrorDiag),
        where status is True if a trial succeeded.
        """

        status = False
        pilotErrorDiag = ""
        cmd = "%s %s" % (wgetCommand, jobTrf)
        trial = 1
        max_trials = 3

        # try to download the trf a maximum of 3 times
        while trial <= max_trials:
            tolog("Executing command [Trial %d/%d]: %s" % (trial, max_trials, cmd))
            ec, rets = commands.getstatusoutput(cmd)
            if not rets:
                rets = "(None)"
            if ec != 0:
                # Analyze exit code / output
                from futil import check_syserr
                check_syserr(ec, rets)
                pilotErrorDiag = "wget command failed: %d, %s" % (ec, rets)
                tolog("!!WARNING!!3000!! %s" % (pilotErrorDiag))
                if trial == max_trials:
                    tolog("!!FAILED!!3000!! Could not download trf: %s" % (rets))
                    status = False
                    break
                else:
                    tolog("Will try again after 60s..")
                    from time import sleep
                    sleep(60)
            else:
                pilotErrorDiag = ""
                tolog("wget command returned: %s" % (rets))
                status = True
                break
            trial += 1

        return status, pilotErrorDiag

    def getAnalysisTrf(self, wgetCommand, origTRF, pilot_initdir):
        """
        Get the trf to be used for analysis jobs.

        The trf is copied from pilot_initdir if it is found there, otherwise it
        is downloaded from its base URL or from one of the other valid base
        URLs. Its permission is then changed to 0755.

        The failure paths return (error code, pilotErrorDiag, "").
        """

        pilotErrorDiag = ""
        trfName = origTRF.split('/')[-1]
        tolog("trfName = %s" % (trfName))
        origBaseURL = ""

        # Copy trf from pilot init dir if distributed with pilot code
        fname = os.path.join(pilot_initdir, trfName)
        status = False
        if os.path.exists(fname):
            from shutil import copy2
            try:
                copy2(fname, os.getcwd())
            except Exception as e:
                tolog("!!WARNING!!2999!! Could not copy trf from pilot init dir: %s" % str(e))
            else:
                tolog("Copied trf (%s) from pilot init dir" % (fname))
                status = True

        # Download trf
        if not status:
            # verify the base URL
            for baseURL in self.getValidBaseURLs():
                if origTRF.startswith(baseURL):
                    origBaseURL = baseURL
                    break

            if origBaseURL == "":
                pilotErrorDiag = "Invalid base URL: %s" % (origTRF)
                return self.__error.ERR_TRFDOWNLOAD, pilotErrorDiag, ""
            else:
                tolog("Verified the trf base URL: %s" % (origBaseURL))

            # try to download from the required location, if not - switch to backup
            for baseURL in self.getValidBaseURLs(order=origBaseURL):
                # swap the verified prefix only; the URL must not be used as a
                # regular expression (its dots would match any character)
                trf = baseURL + origTRF[len(origBaseURL):]
                tolog("Attempting to download trf: %s" % (trf))
                status, pilotErrorDiag = self.downloadTrf(wgetCommand, trf)
                if status:
                    break

            if not status:
                return self.__error.ERR_TRFDOWNLOAD, pilotErrorDiag, ""

            tolog("Successfully downloaded trf")

        tolog("Changing permission of %s to 0755" % (trfName))
        try:
            os.chmod(trfName, 0o755)
        except Exception as e:
            pilotErrorDiag = "Failed to chmod %s: %s" % (trfName, str(e))
            return self.__error.ERR_CHMODTRF, pilotErrorDiag, ""
        # NOTE(review): no return statement is visible for the success path - confirm against the callers
# Optional def getSubprocess(self, cmd, stdout=None, stderr=None): """ Execute and return a subprocess """ process = None try: tolog("Executing command: %s" % (cmd)) if stdout and stderr: # use stdout/stdout file objects to redirect the stdout/stderr streams process = Popen(cmd, shell=True, stdout=stdout, stderr=stderr) else: process = Popen(cmd, shell=True) except Exception, e: tolog("!!WARNING!!2344!! Caught exception: %s" % (e)) else: tolog("Subprocess is running") return process # Optional def getJobExecutionCommand4EventService(self): """ Define and test the command(s) that will be used to execute the payload for the event service """ # E.g. cmd = ["source <path>/; <path>/python <script>"] # The command returned from this method is executed using subprocess.Popen() from the runEvent module # Note: this optional method only need to be defined in case the event service is to be used # As of March 2014, this is not yet functional or documented. # The actual command must be declared as a list since that is expected by Popen() cmd = [""]
class Cleaner:
    """
    This class is used to clean up lingering old/lost jobs.
    The clean-up criteria is that for a found Panda job directory,
    if the pilotlog.txt has not been updated for at least <limit> hours,
    and if the job state is 'running' then the assumption is that the
    job was unexpectedly terminated and should be erased from disk.
    The class defines the clean-up limit, but overrides this value if set
    in schedconfig.
    The cleanup() method should be executed after queuedata has been downloaded
    and after job recovery (which might or might not be turned on).

    Usage:
           from Cleaner import Cleaner
           cleaner = Cleaner(limit=<limit>, path=<path>, uflag=<uflag>)
           ec = cleaner.cleanup()

    cleanup() will return True for a successful/performed cleanup, False otherwise.

    <path> should normally be thisSite.wntmpdir
    <limit> should be an integer > 0 [hours]
    <uflag> user flag needed to distinguish job type (an analysis pilot is not allowed
            to touch production job directories on some sites)
    """

    def __init__(self, limit=12, path="/tmp", uflag=None):
        """
        Default init with verification.

        limit -- clean-up limit in hours; 0 disables clean-up, anything that
                 is negative, empty or not convertible to int falls back to 12
        path  -- directory to scan; empty values fall back to "/tmp"
        uflag -- user flag, stored only when the path has been verified
        """

        self.clean = True
        self.uflag = None
        # define the attribute up front so it exists even when clean-up is
        # switched off by limit == 0
        self.path = None

        # verify the clean-up limit
        # (isinstance avoids the false positives of a substring test on the
        # class name, e.g. a class called 'Point' contains 'int')
        if limit and not isinstance(limit, int):
            tolog("Trying to convert limit from type %s to int" % (str(limit.__class__)))
            try:
                limit = int(limit)
            except Exception:
                tolog("Failed to convert, reset to default")
                limit = 12
        if limit == 0:
            tolog("Clean-up limit set to zero (no clean-up will be done)")
            self.clean = False
        elif not limit or limit < 0:
            # 'not limit' is tested first so that None is never compared to 0
            limit = 12
            tolog("!!WARNING!!5500!! Clean-up limit out of bounds, reset to default: %d" % (limit))

        self.limit = limit
        tolog("Cleaner initialized with clean-up limit: %d hours" % (self.limit))

        # verify the clean-up path and set the uflag if necessary
        if self.clean:
            if not path:
                path = "/tmp"
                tolog("Requested path reset to default: %s" % (path))
            if os.path.exists(path):
                self.path = path
                tolog("Cleaner will scan for lost directories in verified path: %s" % (self.path))

                if uflag:
                    self.uflag = uflag
            else:
                tolog("!!WARNING!!5500!! No such directory: %s (clean-up not possible)" % (path))
                self.path = None
                self.clean = False

    def cleanup(self):
        """
        Execute the clean-up.

        Returns True when the clean-up ran (including the case where no job
        state files were found), False when clean-up is turned off or a lost
        job work directory could not be erased.
        """

        if not self.clean:
            tolog("Clean-up turned off")
            return False

        status = True

        tolog("Executing empty dirs clean-up, stage 1/5")
        Cleaner.purgeEmptyDirs(self.path)

        tolog("Executing work dir clean-up, stage 2/5")
        Cleaner.purgeWorkDirs(self.path)

        tolog("Executing maxed-out dirs clean-up, stage 3/5")
        Cleaner.purgeMaxedoutDirs(self.path)

        tolog("Executing AthenaMP clean-up, stage 4/5 <SKIPPED>")
        #files = ['AthenaMP_*', 'fifo_*', 'TokenExtractorChannel*', 'zmq_EventService*', 'asetup*', 'tmp*.pkl']
        #for f in files:
        #    Cleaner.purgeFiles(self.path, f, limit=48*3600)

        tolog("Executing PanDA Pilot dir clean-up, stage 5/5")
        JS = JobState()

        # grab all job state files in all work directories
        job_state_files = glob(self.path + "/Panda_Pilot_*/jobState-*.pickle")
        number_of_files = len(job_state_files)
        file_number = 0
        max_cleanups = 30
        tolog("Number of found job state files: %d" % (number_of_files))

        if not job_state_files:
            tolog("No job state files were found, aborting clean-up")
            return status

        # loop over all found job state files
        for file_path in job_state_files:
            file_number += 1
            if file_number > max_cleanups:
                tolog("Maximum number of job recoveries exceeded for this pilot: %d" % (max_cleanups))
                break
            tolog("Processing job state file %d/%d: %s" % (file_number, number_of_files, file_path))
            current_time = int(time.time())

            # when was file last modified?
            try:
                file_modification_time = os.path.getmtime(file_path)
            except OSError:
                # skip this file since it was not possible to read the modification time
                continue

            # was the job state file updated longer than the time limit? (convert to seconds)
            mod_time = current_time - file_modification_time
            if mod_time <= self.limit * 3600:
                tolog("File was last modified %d seconds ago (skip)" % (mod_time))
                continue

            tolog("File was last modified %d seconds ago (proceed)" % (mod_time))
            cmd = "whoami; ls -lF %s; ls -lF %s" % (file_path, os.path.dirname(file_path))
            tolog("Executing command: %s" % (cmd))
            ec, rs = commands.getstatusoutput(cmd)
            if ec == 0:
                tolog("%s" % (rs))
            else:
                tolog("!!WARNING!!2999!! %d, %s" % (ec, rs))

            # open the job state file
            if not JS.get(file_path):
                continue

            # decode the job state info
            _job, _site, _node, _recoveryAttempt = JS.decode()
            if not (_job and _site and _node):
                continue

            # add member if it doesn't exist (new Job version); done only
            # after _job is known to be usable
            if not hasattr(_job, 'prodSourceLabel'):
                _job.prodSourceLabel = ''

            # query the job state file for job information
            state = _job.result[0]
            stale_holding = (state == 'holding' and mod_time > 7 * 24 * 3600)
            if not (state == 'running' or state == 'starting' or stale_holding):
                tolog("Job found in state: %s" % (state))
                continue

            if state == 'holding':
                tolog("Job %s was found in %s state but has not been modified for a long time - will be cleaned up" % (_job.jobId, state))
            else:
                tolog("Job %s was found in %s state - will be cleaned up" % (_job.jobId, state))
            tolog("Erasing directory: %s" % (_site.workdir))
            cmd = "rm -rf %s" % (_site.workdir)
            try:
                ec, rs = commands.getstatusoutput(cmd)
            except Exception as e:
                tolog("!!WARNING!!5500!! Could not erase lost job workdir: %s" % (e))
                status = False
                break
            # getstatusoutput reports a failed command through its status,
            # not through an exception, so the status has to be checked too
            if ec != 0:
                tolog("!!WARNING!!5500!! Could not erase lost job workdir: %d, %s" % (ec, rs))
                status = False
                break
            tolog("Lost job workdir removed")

        return status

    # NOTE(review): defined without 'self' and called as Cleaner.purgeEmptyDirs(path);
    # presumably wrapped with staticmethod() in a part of the class not shown here - confirm
    def purgeEmptyDirs(path):
        """
        Locate and remove empty lingering dirs.

        Examines at most 50 Panda_Pilot_* directories under path and removes
        those that were last modified more than two hours ago and contain at
        most one entry.
        """

        all_dirs = glob("%s/Panda_Pilot_*" % (path))
        max_dirs = 50
        purged_nr = 0

        for _dir in all_dirs[:max_dirs]:
            # when was the dir last modified?
            current_time = int(time.time())
            try:
                file_modification_time = os.path.getmtime(_dir)
            except OSError:
                # skip this dir since it was not possible to read the modification time
                continue

            mod_time = current_time - file_modification_time
            if mod_time <= 2 * 3600:
                continue

            try:
                ls = listdir(_dir)
            except Exception as e:
                tolog("!!WARNING!!2999!! Exception caught: %s" % str(e))
                continue

            if len(ls) > 1:
                continue

            if len(ls) == 0:
                tolog("Found empty dir: %s (last modified %d s ago, will now purge it)" % (_dir, mod_time))
            else:
                tolog("Found empty dir: %s (last modified %d s ago, will now purge it, 1 sub dir: %s)" % (_dir, mod_time, ls[0]))

            ec, rs = commands.getstatusoutput("rm -rf %s" % (_dir))
            if ec != 0:
                tolog("Failed to remove dir: %d, %s (belonging to user %d, pilot is run by user %d)" %
                      (ec, rs, os.stat(_dir)[4], os.getuid()))
            else:
                purged_nr += 1

        tolog("Purged %d empty directories" % (purged_nr))
def cleanup(self):
    """
    Execute the clean-up.

    Returns True when the clean-up ran (including the case where no job
    state files were found), False when clean-up is turned off or a lost
    job work directory could not be erased.
    """

    if not self.clean:
        tolog("Clean-up turned off")
        return False

    status = True

    tolog("Executing empty dirs clean-up, stage 1/5")
    Cleaner.purgeEmptyDirs(self.path)

    tolog("Executing work dir clean-up, stage 2/5")
    Cleaner.purgeWorkDirs(self.path)

    tolog("Executing maxed-out dirs clean-up, stage 3/5")
    Cleaner.purgeMaxedoutDirs(self.path)

    tolog("Executing AthenaMP clean-up, stage 4/5 <SKIPPED>")
    #files = ['AthenaMP_*', 'fifo_*', 'TokenExtractorChannel*', 'zmq_EventService*', 'asetup*', 'tmp*.pkl']
    #for f in files:
    #    Cleaner.purgeFiles(self.path, f, limit=48*3600)

    tolog("Executing PanDA Pilot dir clean-up, stage 5/5")
    JS = JobState()

    # grab all job state files in all work directories
    job_state_files = glob(self.path + "/Panda_Pilot_*/jobState-*.pickle")
    number_of_files = len(job_state_files)
    file_number = 0
    max_cleanups = 30
    tolog("Number of found job state files: %d" % (number_of_files))

    if not job_state_files:
        tolog("No job state files were found, aborting clean-up")
        return status

    # loop over all found job state files
    for file_path in job_state_files:
        file_number += 1
        if file_number > max_cleanups:
            tolog("Maximum number of job recoveries exceeded for this pilot: %d" % (max_cleanups))
            break
        tolog("Processing job state file %d/%d: %s" % (file_number, number_of_files, file_path))
        current_time = int(time.time())

        # when was file last modified?
        try:
            file_modification_time = os.path.getmtime(file_path)
        except OSError:
            # skip this file since it was not possible to read the modification time
            continue

        # was the job state file updated longer than the time limit? (convert to seconds)
        mod_time = current_time - file_modification_time
        if mod_time <= self.limit * 3600:
            tolog("File was last modified %d seconds ago (skip)" % (mod_time))
            continue

        tolog("File was last modified %d seconds ago (proceed)" % (mod_time))
        cmd = "whoami; ls -lF %s; ls -lF %s" % (file_path, os.path.dirname(file_path))
        tolog("Executing command: %s" % (cmd))
        ec, rs = commands.getstatusoutput(cmd)
        if ec == 0:
            tolog("%s" % (rs))
        else:
            tolog("!!WARNING!!2999!! %d, %s" % (ec, rs))

        # open the job state file
        if not JS.get(file_path):
            continue

        # decode the job state info
        _job, _site, _node, _recoveryAttempt = JS.decode()
        if not (_job and _site and _node):
            continue

        # add member if it doesn't exist (new Job version); done only
        # after _job is known to be usable
        if not hasattr(_job, 'prodSourceLabel'):
            _job.prodSourceLabel = ''

        # query the job state file for job information
        state = _job.result[0]
        stale_holding = (state == 'holding' and mod_time > 7 * 24 * 3600)
        if not (state == 'running' or state == 'starting' or stale_holding):
            tolog("Job found in state: %s" % (state))
            continue

        if state == 'holding':
            tolog("Job %s was found in %s state but has not been modified for a long time - will be cleaned up" % (_job.jobId, state))
        else:
            tolog("Job %s was found in %s state - will be cleaned up" % (_job.jobId, state))
        tolog("Erasing directory: %s" % (_site.workdir))
        cmd = "rm -rf %s" % (_site.workdir)
        try:
            ec, rs = commands.getstatusoutput(cmd)
        except Exception as e:
            tolog("!!WARNING!!5500!! Could not erase lost job workdir: %s" % (e))
            status = False
            break
        # getstatusoutput reports a failed command through its status,
        # not through an exception, so the status has to be checked too
        if ec != 0:
            tolog("!!WARNING!!5500!! Could not erase lost job workdir: %d, %s" % (ec, rs))
            status = False
            break
        tolog("Lost job workdir removed")

    return status
def __init__(self, limit=12, path="/tmp", uflag=None):
    """
    Default init with verification.

    limit -- clean-up limit in hours; 0 disables clean-up, anything that
             is negative, empty or not convertible to int falls back to 12
    path  -- directory to scan; empty values fall back to "/tmp"
    uflag -- user flag, stored only when the path has been verified
    """

    self.clean = True
    self.uflag = None
    # define the attribute up front so it exists even when clean-up is
    # switched off by limit == 0
    self.path = None

    # verify the clean-up limit
    # (isinstance avoids the false positives of a substring test on the
    # class name, e.g. a class called 'Point' contains 'int')
    if limit and not isinstance(limit, int):
        tolog("Trying to convert limit from type %s to int" % (str(limit.__class__)))
        try:
            limit = int(limit)
        except Exception:
            tolog("Failed to convert, reset to default")
            limit = 12
    if limit == 0:
        tolog("Clean-up limit set to zero (no clean-up will be done)")
        self.clean = False
    elif not limit or limit < 0:
        # 'not limit' is tested first so that None is never compared to 0
        limit = 12
        tolog("!!WARNING!!5500!! Clean-up limit out of bounds, reset to default: %d" % (limit))

    self.limit = limit
    tolog("Cleaner initialized with clean-up limit: %d hours" % (self.limit))

    # verify the clean-up path and set the uflag if necessary
    if self.clean:
        if not path:
            path = "/tmp"
            tolog("Requested path reset to default: %s" % (path))
        if os.path.exists(path):
            self.path = path
            tolog("Cleaner will scan for lost directories in verified path: %s" % (self.path))

            if uflag:
                self.uflag = uflag
        else:
            tolog("!!WARNING!!5500!! No such directory: %s (clean-up not possible)" % (path))
            self.path = None
            self.clean = False
def stageOutFile(self, source, destination, token=None, outputDir=None):
    """
    Stage out the file. Should be implemented by different site movers.

    source      -- local path of the file to transfer
    destination -- target SURL; its parent directory is created with gfal-mkdir
    token       -- optional space token descriptor, passed to gfal-copy with -S
    outputDir   -- optional directory; when it ends with "PilotMVOutputDir" the
                   file is copied there and the gfal-copy command is written to
                   a ".gfalcmd" file next to it, then this method polls for a
                   ".gfalcmdfinished" / ".gfalcmdfailed" marker file

    Returns (PilotErrors.ERR_MKDIR, outputRet) when the directory creation fails.
    """

    statusRet = 0
    outputRet = {}
    outputRet["errorLog"] = None
    outputRet["report"] = {}
    outputRet["report"]["clientState"] = None

    # determine which timeout option to use
    timeout_option = "-t %d" % (self.timeout)

    #mkdir
    _cmd_str = '%s gfal-mkdir --verbose %s -p %s' % (self._setup, timeout_option, os.path.dirname(destination))
    self.log("Executing command: %s" % (_cmd_str))
    status, output = commands.getstatusoutput(_cmd_str)
    self.log("status: %s, output: %s" % (status, output.replace("\n"," ")))
    if status != 0:
        outputRet["errorLog"] = output
        outputRet["report"]["clientState"] = "ERR_MKDIR"
        return PilotErrors.ERR_MKDIR, outputRet

    # cleanup the SURL if necessary (remove port and srm substring)
    if token:
        # Special case for GROUPDISK (do not remove dst: bit before this stage, needed in several places)
        if "dst:" in token:
            token = token[len('dst:'):]
            tolog("Dropped dst: part of space token descriptor; token=%s" % (token))
            if 'DATADISK' in token:
                token = "ATLASDATADISK"
            else:
                token = "ATLASGROUPDISK"
            tolog("Space token descriptor reset to: %s" % (token))

        _cmd_str = '%s gfal-copy --verbose %s -D "SRM PLUGIN:TURL_PROTOCOLS=gsiftp" -S %s file:%s %s' % (self._setup, timeout_option, token, source, destination)
    else:
        # surl is the same as putfile
        _cmd_str = '%s gfal-copy --verbose %s -D "SRM PLUGIN:TURL_PROTOCOLS=gsiftp" file:%s %s' % (self._setup, timeout_option, source, destination)

    ec = -1
    t0 = os.times()
    o = '(not defined)'
    if outputDir and outputDir.endswith("PilotMVOutputDir"):
        timeStart = time()
        outputFile = os.path.join(outputDir, os.path.basename(source))
        mvCmd = "cp -f %s %s" % (source, outputFile)
        tolog("Executing command: %s" % (mvCmd))
        lstatus, loutput = commands.getstatusoutput(mvCmd)
        if lstatus != 0:
            ec = lstatus
            o = loutput
        else:
            outputFileCmd = outputFile + ".gfalcmd"
            remoteCmd = _cmd_str.replace(source, outputFile)
            # 'with' guarantees the handle is closed even if the write fails
            with open(outputFileCmd, 'w') as handle:
                handle.write(remoteCmd)
            tolog("Write command %s to %s" % (remoteCmd, outputFileCmd))
            tolog("Waiting remote to finish transfer")
            finishedMarker = outputFile + ".gfalcmdfinished"
            failedMarker = outputFile + ".gfalcmdfailed"
            # if neither marker shows up before the timeout, ec keeps its
            # non-zero initial value and this message is reported
            o = "Remote timeout to transfer out file"
            while (time() - timeStart) < self.timeout:
                sleep(5)
                if os.path.exists(finishedMarker):
                    ec = 0
                    o = "Remote finished transfer"
                    tolog(o)
                    os.remove(finishedMarker)
                    break
                if os.path.exists(failedMarker):
                    # the failure marker must not be reported as success
                    ec = 1
                    o = "Remote failed to transfer file"
                    tolog(o)
                    os.remove(failedMarker)
                    break
    else:
        tolog("Executing command: %s" % (_cmd_str))
        outputRet["report"]['relativeStart'] = time()
        outputRet["report"]['transferStart'] = time()
        try:
            ec, o = commands.getstatusoutput(_cmd_str)
        except Exception as e:
            # log the exception itself rather than the placeholder output
            tolog("!!WARNING!!2999!! gfal-copy threw an exception: %s" % (e))
            o = str(e)
    # NOTE(review): the visible definition ends here without a return
    # statement - confirm against the remainder of the function
tolog(o) os.remove(outputFile + ".gfalcmdfailed") break else: tolog("Executing command: %s" % (_cmd_str)) outputRet["report"]['relativeStart'] = time() outputRet["report"]['transferStart'] = time() try: ec, o = commands.getstatusoutput(_cmd_str) except Exception, e: tolog("!!WARNING!!2999!! gfal-copy threw an exception: %s" % (o)) o = str(e) outputRet["report"]['validateStart'] = time() t1 = os.times() t = t1[4] - t0[4] tolog("Command finished after %f s" % (t)) tolog("ec = %d, output = %s" % (ec, o.replace("\n"," "))) if ec != 0: tolog("!!WARNING!!2990!! Command failed: %s" % (_cmd_str)) #check_syserr(ec, o) tolog('!!WARNING!!2990!! Stage Out failed: Status=%d Output=%s' % (ec, str(o.replace("\n"," ")))) status, output = self.errorToReport(o, t, source, stageMethod="stageOut") if status == PilotErrors.ERR_FILEEXIST: return status, output # check if file was partially transferred, if so, remove it _ec, removeOutput = self.removeRemoteFile(destination) if not _ec : self.log("Failed to remove file ") # i.e. do not retry stage-out
def errorToReport(self, errorOutput, timeUsed, fileName, stageMethod='stageIN'):
    """
    Translate the output of a failed copy command into a pilot error.

    errorOutput -- combined stdout/stderr text of the copy command
    timeUsed    -- seconds the command ran; compared with self.timeout
    fileName    -- name of the file being transferred (used in diagnostics)
    stageMethod -- "stageIN" selects the stage-in error codes, any other
                   value selects the stage-out codes

    Returns a tuple (pilot error code, outputRet) where outputRet is a dict
    with the keys "errorLog" (diagnostic text) and "report" (a dict holding
    "clientState").
    """

    outputRet = {}
    outputRet["errorLog"] = None
    outputRet["report"] = {}
    outputRet["report"]["clientState"] = None

    def _result(errorCode, clientState, diag):
        # fill in the report and build the return tuple in one place
        outputRet["report"]["clientState"] = clientState
        outputRet["errorLog"] = diag
        return errorCode, outputRet

    if "File exists" in errorOutput or "SRM_FILE_BUSY" in errorOutput:
        pilotErrorDiag = "File already exist in the destination."
        tolog("!!WARNING!!2990!! %s" % (pilotErrorDiag))
        return _result(PilotErrors.ERR_FILEEXIST, 'FILE_EXIST', pilotErrorDiag)

    if "Could not establish context" in errorOutput:
        pilotErrorDiag = "Could not establish context: Proxy / VO extension of proxy has probably expired"
        tolog("!!WARNING!!2990!! %s" % (pilotErrorDiag))
        return _result(PilotErrors.ERR_NOPROXY, 'CONTEXT_FAIL', pilotErrorDiag)

    if "globus_xio:" in errorOutput:
        pilotErrorDiag = "Globus system error: %s" % (errorOutput)
        self.log("Globus system error encountered")
        return _result(PilotErrors.ERR_GETGLOBUSSYSERR, 'GLOBUS_FAIL', pilotErrorDiag)

    if "No space left on device" in errorOutput:
        pilotErrorDiag = "No available space left on local disk: %s" % (errorOutput)
        tolog("No available space left on local disk")
        return _result(PilotErrors.ERR_NOLOCALSPACE, 'NO_SPACE', pilotErrorDiag)

    if "No such file or directory" in errorOutput:
        if "DBRelease" in fileName:
            pilotErrorDiag = "Missing DBRelease file: %s" % (fileName)
            tolog("!!WARNING!!2990!! %s" % (pilotErrorDiag))
            return _result(PilotErrors.ERR_MISSDBREL, 'NO_DBREL', pilotErrorDiag)
        pilotErrorDiag = "No such file or directory: %s" % (fileName)
        tolog("!!WARNING!!2990!! %s" % (pilotErrorDiag))
        return _result(PilotErrors.ERR_NOSUCHFILE, 'NO_FILE', pilotErrorDiag)

    if timeUsed >= self.timeout:
        pilotErrorDiag = "Copy command self timed out after %d s" % (timeUsed)
        tolog("!!WARNING!!2990!! %s" % (pilotErrorDiag))
        if stageMethod == "stageIN":
            # return the report dict like every other branch (the diagnostic
            # string used to be returned here instead)
            return _result(PilotErrors.ERR_GETTIMEOUT, 'GET_TIMEOUT', pilotErrorDiag)
        return _result(PilotErrors.ERR_PUTTIMEOUT, 'CP_TIMEOUT', pilotErrorDiag)

    if len(errorOutput) == 0:
        # the exit code is not passed to this method, so it cannot be quoted
        # (the old message formatted an undefined name and raised NameError)
        pilotErrorDiag = "Copy command returned an error code but no output"
    else:
        pilotErrorDiag = errorOutput
    if stageMethod == "stageIN":
        return _result(PilotErrors.ERR_STAGEINFAILED, 'COPY_ERROR', pilotErrorDiag)
    return _result(PilotErrors.ERR_STAGEOUTFAILED, 'COPY_ERROR', pilotErrorDiag)
import time from pUtil import tolog, convert, getSiteInformation, readpar def openFile(filename, mode): """ Open and return a file pointer for the given mode """ # Note: caller needs to close the file f = None if os.path.exists(filename): try: f = open(filename, mode) except IOError, e: tolog("!!WARNING!!2997!! Caught exception: %s" % (e)) else: tolog("!!WARNING!!2998!! File does not exist: %s" % (filename)) return f def getJSONDictionary(filename): """ Read a dictionary with unicode to utf-8 conversion """ dictionary = None from json import load f = openFile(filename, 'r') if f: try: dictionary = load(f) except Exception, e: tolog("!!WARNING!!2222!! Failed to load json dictionary: %s" % (e)) else:
def log(self, errorLog):
    """ Forward the given message to tolog() unchanged """

    tolog(errorLog)
def extractAppdir(self):
    """
    Called by the runMain method.

    Nothing is extracted here; the method only logs that fact and returns
    the tuple (0, "").
    """

    nothing_to_do = (0, "")
    tolog("CMSExperiment - extractAppdir - nothing to do")
    return nothing_to_do
def extractOutputFilesFromJSON(workDir, allowNoOutput):
    """
    In case the trf has produced additional output files (spill-over),
    extract all output files from the jobReport.

    workDir       -- directory passed on to getJobReport()
    allowNoOutput -- list of file (base) names that may be ignored when
                     they contain no entries

    Returns (output_files, guids): two parallel lists. A guid entry is None
    when the jobReport holds no 'file_guid' for that file.
    """
    # Note: ignore files with nentries = 0

    # function-scope import: avoids shadowing the builtin compile() and keeps
    # the block independent of the module's import list
    import re

    output_files = []
    guids = []
    # compiled lazily and only once (see below)
    allowNoOutputEx = None

    def _addFile(f_names_dictionary):
        # record the file name together with its guid (None if missing)
        output_files.append(f_names_dictionary['name'])
        if 'file_guid' in f_names_dictionary:
            guids.append(f_names_dictionary['file_guid'])
        else:
            tolog("!!WARNING!!1212!! Did not find any guid for this file: %s (will be generated)" % (f_names_dictionary['name']))
            guids.append(None)

    tolog("Extracting output files from jobReport")

    jobReport_dictionary = getJobReport(workDir)
    if jobReport_dictionary != {}:
        if 'files' not in jobReport_dictionary:
            tolog("No such key: files")
        elif 'output' not in jobReport_dictionary['files']:
            tolog("No such key: output")
        else:
            for f_dictionary in jobReport_dictionary['files']['output']:
                if 'subFiles' not in f_dictionary:
                    tolog("No such key: subFiles")
                    continue

                for f_names_dictionary in f_dictionary['subFiles']:
                    if 'name' not in f_names_dictionary or 'nentries' not in f_names_dictionary:
                        tolog("No such key: name/nentries")
                        continue

                    name = f_names_dictionary['name']
                    nentries = f_names_dictionary['nentries']
                    # exact type test on purpose: bool must not count as a number here
                    is_number = type(nentries) is int

                    # Only add the file is nentries > 0
                    if is_number and nentries > 0:
                        _addFile(f_names_dictionary)
                        continue

                    # Only ignore the file if it is allowed to be ignored
                    if not is_number:
                        tolog("!!WARNING!!4542!! nentries is not a number: %s" % str(nentries))

                    # Special handling for origName._NNN
                    # origName._NNN are unmerged files dynamically produced by AthenaMP. Job definition doesn't
                    # explicitly specify those names but only the base names, thus allowNoOutput contains only base names
                    # in this case. We want to ignore origName._NNN when allowNoOutput=['origName']
                    if allowNoOutputEx is None:
                        # the base names are literal text, so escape them before
                        # appending the suffix pattern
                        allowNoOutputEx = [re.compile(re.escape(s) + r'\.?_\d+$') for s in allowNoOutput]

                    if name in allowNoOutput or any(patt.match(name) for patt in allowNoOutputEx):
                        tolog("Ignoring file %s since nentries=%s" % (name, str(nentries)))
                    else:
                        tolog("Will not ignore empty file %s since file is not in allowNoOutput list" % (name))
                        _addFile(f_names_dictionary)

    if len(output_files) == 0:
        tolog("No output files found in jobReport")
    else:
        tolog("Output files found in jobReport: %s" % (output_files))

    return output_files, guids
def verifySwbase(self, appdir):
    """
    Called by, check needed for handleQueuedata method.

    The appdir argument is not inspected; the method logs that there is
    nothing to do and returns 0.
    """

    exit_code = 0
    tolog("CMSExperiment - verifySwbase - nothing to do")
    return exit_code
def interpretPayloadStdout(self, job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, failureCode):
    """
    Payload error handling.

    job                             -- job object; pilotErrorDiag and result[2] are updated on failure
    res                             -- result sequence of the payload execution; res[0] is used as the
                                       exit code and res[1] as the captured output
    getstatusoutput_was_interrupted -- True if the payload execution was interrupted
    current_job_number              -- index used to select the stdout file name
    runCommandList                  -- list of run commands (only its length is used)
    failureCode                     -- interrupt failure code, or a false value if none

    Returns the (possibly updated) job object.
    Raises Exception when the payload failed because the execution was interrupted.
    """
    # NOTE: Move away ATLAS specific info in this method, e.g. vmPeak stuff

    error = PilotErrors()
    #Mancinelli: moved it in experiment class method handleTrfExitcode
    #transExitCode = res[0]%255
    tolog("Mancinellidebug: res = %s res[0] = %s" % (res, res[0]))

    # Get the proper stdout filename
    number_of_jobs = len(runCommandList)
    filename = getStdoutFilename(job.workdir, job.stdout, current_job_number, number_of_jobs)

    # Try to identify out of memory errors in the stderr
    out_of_memory = self.isOutOfMemory(job=job, number_of_jobs=number_of_jobs)
    failed = out_of_memory # failed boolean used below

    # A killed job can have empty output but still transExitCode == 0
    no_payload_output = False
    installation_error = False
    if getstatusoutput_was_interrupted:
        if os.path.exists(filename):
            if os.path.getsize(filename) > 0:
                tolog("Payload produced stdout but was interrupted (getstatusoutput threw an exception)")
            else:
                no_payload_output = True
            failed = True
        else:
            failed = True
            no_payload_output = True
    elif len(res[1]) < 20: # protect the following comparison against massive outputs
        if res[1] == 'Undefined':
            failed = True
            no_payload_output = True
    elif failureCode:
        failed = True
    else:
        # check for installation error
        res_tmp = res[1][:1024]
        # NOTE(review): the original condition also contained "'' in res_tmp",
        # which is always true; it is dropped here without changing the result
        if res_tmp[0:3] == "sh:" and 'No such file or directory' in res_tmp:
            failed = True
            installation_error = True

    if res[0] or failed:
        #Mancinelli: all this common part with CMS?
        if failureCode:
            job.pilotErrorDiag = "Payload failed: Interrupt failure code: %d" % (failureCode)
            # (do not set pilot error code)
        elif getstatusoutput_was_interrupted:
            raise Exception("Job execution was interrupted (see stderr)")
        elif out_of_memory:
            job.pilotErrorDiag = "Payload ran out of memory"
            job.result[2] = error.ERR_ATHENAOUTOFMEMORY
        elif no_payload_output:
            job.pilotErrorDiag = "Payload failed: No output"
            job.result[2] = error.ERR_NOPAYLOADOUTPUT
        elif installation_error:
            job.pilotErrorDiag = "Payload failed: Missing installation"
            job.result[2] = error.ERR_MISSINGINSTALLATION
        elif res[0]:
            #Mancinelli: calling for experiment class method to manage transformation exit code
            job = self.handleTrfExitcode(job, res, error, filename)
        else:
            job.pilotErrorDiag = "Payload failed due to unknown reason (check payload stdout)"
            job.result[2] = error.ERR_UNKNOWN
        tolog("!!FAILED!!3000!! %s" % (job.pilotErrorDiag))

    # The former transExitCode-based error handling, which was kept here as a
    # triple-quoted string, has been removed: it was never executed and the
    # exit code is now handled by handleTrfExitcode (see the note above)

    return job
def getCMSRunCommand(self, job, jobSite, trfName):
    """
    Build the shell command that runs the CMS transformation.

    job     -- job object; transferType, outFiles and jobPars are read
    jobSite -- site object (not used in this method)
    trfName -- file name of the transformation, run as ./<trfName>

    Returns (ec, pilotErrorDiag, run_command); ec is always 0 and
    pilotErrorDiag always empty in this implementation.
    """

    # NOTE(review): updateCopysetups is not used in this method; the import is
    # retained unchanged - confirm before removing
    from RunJobUtilities import updateCopysetups

    ec = 0
    pilotErrorDiag = ""
    run_command = ""

    # get relevant file transfer info (only copysetup is needed below)
    dInfo, useCopyTool, useDirectAccess, useFileStager, oldPrefix, newPrefix, copysetup, usePFCTurl, lfcHost =\
           self.getFileTransferInfo(job.transferType, isBuildJob(job.outFiles))

    # extract the setup file from copysetup (and verify that it exists)
    _copysetup = self.getSetupFromCopysetup(copysetup)
    tolog("copysetup = %s" % _copysetup)
    if _copysetup != "" and os.path.exists(_copysetup):
        run_command = 'source %s; ' % (_copysetup)

    # add the user proxy
    if 'X509_USER_PROXY' in os.environ:
        run_command += 'export X509_USER_PROXY=%s; ' % os.environ['X509_USER_PROXY']
    else:
        tolog("Could not add user proxy to the run command (proxy does not exist)")

    # The job parameters are passed through verbatim; the option-parser based
    # re-assembly that was kept here as a triple-quoted string was never
    # executed and has been removed
    run_command += './%s %s' % (trfName, job.jobPars)

    return ec, pilotErrorDiag, run_command
def checkSpecialEnvVars(self, sitename):
    """
    Called by

    The sitename argument is not inspected; the method logs that there is
    nothing to do and returns 0.
    """

    exit_code = 0
    tolog("CMSExperiment - checkSpecialEnvVars - nothing to do")
    return exit_code
def finishJob(self):
    """
    Finish the HPC job: clean up input files, verify that all event ranges
    were handled, create the output file metadata, interpret the payload
    result and report the final state to the pilot server.

    Every failure path goes through self.failJob(); the method ends by
    calling self.sysExit().
    NOTE(review): the code after each failJob() call is written as if
    failJob() does not return - confirm against its implementation.
    """
    try:
        self.__hpcManager.finishJob()
    except:
        # best effort: log the exception value and traceback object, then carry on
        tolog(sys.exc_info()[1])
        tolog(sys.exc_info()[2])

    # If payload leaves the input files, delete them explicitly
    if self.__job.inFiles:
        # the returned code is not inspected
        ec = pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)
    #if self.__output_es_files:
    #    ec = pUtil.removeFiles("/", self.__output_es_files)

    # error code used if the job has to be failed below; attempts below 4 are
    # reported with ERR_ESRECOVERABLE
    errorCode = PilotErrors.ERR_UNKNOWN
    if self.__job.attemptNr < 4:
        errorCode = PilotErrors.ERR_ESRECOVERABLE

    #check HPC job status
    #if self.__hpcStatus:
    #    self.failJob(0, 1220, self.__job, pilotErrorDiag="HPC job failed")

    if len(self.__eventRanges) == 0:
        tolog("Cannot get event ranges")
        self.failJob(0, errorCode, self.__job, pilotErrorDiag="Cannot get event ranges")

    # check whether all event ranges are handled
    # (the values of self.__eventRanges are status strings: 'new', 'Done', 'stagedOut')
    tolog("Total event ranges: %s" % len(self.__eventRanges))
    not_handled_events = self.__eventRanges.values().count('new')
    tolog("Not handled events: %s" % not_handled_events)
    done_events = self.__eventRanges.values().count('Done')
    tolog("Finished events: %s" % done_events)
    stagedOut_events = self.__eventRanges.values().count('stagedOut')
    tolog("stagedOut but not updated to panda server events: %s" % stagedOut_events)
    # any progress at all makes the failure recoverable, regardless of attemptNr
    if done_events + stagedOut_events:
        errorCode = PilotErrors.ERR_ESRECOVERABLE
    if not_handled_events + stagedOut_events:
        tolog("Not all event ranges are handled. failed job")
        self.failJob(0, errorCode, self.__job, pilotErrorDiag="Not All events are handled(total:%s, left:%s)" % (len(self.__eventRanges), not_handled_events + stagedOut_events))

    dsname, datasetDict = self.getDatasets()
    tolog("dsname = %s" % (dsname))
    tolog("datasetDict = %s" % (datasetDict))

    # Create the output file dictionary needed for generating the metadata
    ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(self.__job.outFiles, self.__job.logFile, self.__job.workdir, fullpath=True)
    if ec:
        # missing output file (only error code from prepareOutFiles)
        self.failJob(self.__job.result[1], ec, self.__job, pilotErrorDiag=pilotErrorDiag)
    tolog("outsDict: %s" % str(outsDict))

    # Create metadata for all successfully staged-out output files (include the log file as well, even if it has not been created yet)
    ec, job, outputFileInfo = self.createFileMetadata([], self.__job, outsDict, dsname, datasetDict, self.__jobSite.sitename)
    if ec:
        self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

    # Rename the metadata produced by the payload
    # if not pUtil.isBuildJob(outs):
    self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

    # Check the job report for any exit code that should replace the res_tuple[0]
    res0, exitAcronym, exitMsg = self.getTrfExitInfo(0, self.__job.workdir)
    # exitMsg is used for both the second and the third element of the result tuple
    res = (res0, exitMsg, exitMsg)

    # Payload error handling
    ed = ErrorDiagnosis()
    job = ed.interpretPayload(self.__job, res, False, 0, self.__runCommandList, self.getFailureCode())
    if job.result[1] != 0 or job.result[2] != 0:
        self.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)
    self.__job = job

    job.jobState = "finished"
    job.setState([job.jobState, 0, 0])
    # NOTE(review): jobState is overwritten with job.result right after being
    # set to "finished" - confirm this is intended
    job.jobState = job.result
    # the return value of the server update is not inspected
    rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort(), final=True)

    tolog("Done")
    self.sysExit(self.__job)
def getJobExecutionCommand(self, job, jobSite, pilot_initdir):
    """
    Define and test the command(s) that will be used to execute the payload.

    Input (method is called from RunJob*):
      job           : Job object
      jobSite       : Site object
      pilot_initdir : launch directory of the pilot

    Returns the tuple
      pilot_error_code, pilot_error_diagnostics, job_execution_command,
      special_setup_command, JEM, cmtconfig
    where
      pilot_error_code        : 0 for a successful setup, otherwise a pilot
                                error code
      pilot_error_diagnostics : output from a problematic command or
                                explanatory error diagnostics
      job_execution_command   : command used to execute the payload
      special_setup_command   : special setup command that is also sent to
                                the stage-in/out methods ("" when a failure
                                is returned before it could be determined)
      JEM                     : Job Execution Monitor activation state
                                (always "NO" here)
      cmtconfig               : cmtconfig symbol from the job definition or
                                schedconfig, e.g. "x86_64-slc5-gcc43-opt"
    """

    pilotErrorDiag = ""
    cmd = ""
    JEM = "NO"
    # Must exist before the early returns below; it used to be assigned only
    # near the end of the method, so every failure path raised
    # UnboundLocalError instead of returning the error tuple.
    special_setup_cmd = ""

    # Is it an analysis job or not?
    isCMSRunJob = self.isCMSRunJob(job.trf)
    tolog("isCMSRunJob = %s " % isCMSRunJob)

    # Command used to download trf
    wgetCommand = 'wget'

    # Get the cmtconfig value
    cmtconfig = getCmtconfig(job.cmtconfig)
    if cmtconfig != "":
        tolog("cmtconfig: %s" % (cmtconfig))

    # Set python executable (only the error code and diagnostics are used)
    ec, pilotErrorDiag, pybin = self.setPython()
    if ec == self.__error.ERR_MISSINGINSTALLATION:
        return ec, pilotErrorDiag, "", special_setup_cmd, JEM, cmtconfig

    # Define the job execution command
    if isCMSRunJob:
        # Try to download the analysis trf
        status, pilotErrorDiag, trfName = self.getAnalysisTrf(
            wgetCommand, job.trf, pilot_initdir)
        if status != 0:
            return status, pilotErrorDiag, "", special_setup_cmd, JEM, cmtconfig

        scramArchSetup = self.getScramArchSetupCommand(job)
        # NOTE(review): the error code returned here is not checked; only the
        # diagnostics and the command string are used - confirm this is
        # intended.
        ec, pilotErrorDiag, cmdtrf = self.getCMSRunCommand(
            job, jobSite, trfName)
        cmd = "%s %s" % (scramArchSetup, cmdtrf)

    # Set special_setup_cmd if necessary
    special_setup_cmd = self.getSpecialSetupCommand()
    if special_setup_cmd != "":
        tolog("Special setup command: %s" % (special_setup_cmd))

    # Pipe stdout/err for payload to files
    cmd += " 1>%s 2>%s" % (job.stdout, job.stderr)
    tolog("\nCommand to run the job is: \n%s" % (cmd))

    return 0, pilotErrorDiag, cmd, special_setup_cmd, JEM, cmtconfig
class RunJobHpcEvent(RunJob):
    """
    RunJob sub class for event service jobs that are executed on an HPC.

    The class is a singleton (see __new__). The methods visible here set up
    the site/node/experiment objects and build the Job object from the
    newJobDef module.
    """

    # private data members
    __runjob = "RunJobHpcEvent"  # String defining the sub class
    __instance = None            # The one and only instance (singleton)
    #__error = PilotErrors()     # PilotErrors object

    # Required methods

    def __init__(self):
        """ Default initialization """

        # e.g. self.__errorLabel = errorLabel
        self.__output_es_files = []
        self.__eventRanges = {}       # event range id -> status string
        self.__failedStageOuts = []
        self._hpcManager = None

    def __new__(cls, *args, **kwargs):
        """ Override the __new__ method to make the class a singleton """

        if not cls.__instance:
            cls.__instance = super(RunJob, cls).__new__(cls, *args, **kwargs)

        return cls.__instance

    def getRunJob(self):
        """ Return a string with the experiment name """

        return self.__runjob

    def getRunJobFileName(self):
        """ Return the filename of the module """

        return super(RunJobHpcEvent, self).getRunJobFileName()

    # def argumentParser(self):  <-- inherited from the parent class

    def allowLoopingJobKiller(self):
        """ Should the pilot search for looping jobs? """

        # The pilot has the ability to monitor the payload work directory. If
        # there are no updated files within a certain time limit, the pilot
        # will consider the job as stuck (looping) and will kill it. This sub
        # class switches that monitoring off.
        return False

    def setupHPCEvent(self):
        """ Create the site, node and experiment objects used by the job """

        self.__jobSite = Site.Site()
        self.__jobSite.setSiteInfo(self.argumentParser())

        # The site work dir is the worker node tmp dir; create it if needed
        self.__jobSite.workdir = self.__jobSite.wntmpdir
        if not os.path.exists(self.__jobSite.workdir):
            os.makedirs(self.__jobSite.workdir)

        tolog("runJobHPCEvent.getPilotLogFilename=%s" % self.getPilotLogFilename())
        if self.getPilotLogFilename() != "":
            pUtil.setPilotlogFilename(self.getPilotLogFilename())

        # set node info
        self.__node = Node.Node()
        self.__node.setNodeName(os.uname()[1])
        self.__node.collectWNInfo(self.__jobSite.workdir)

        # redirect stderr
        #sys.stderr = open("%s/runJobHPCEvent.stderr" % (self.__jobSite.workdir), "w")

        tolog("Current job workdir is: %s" % os.getcwd())
        tolog("Site workdir is: %s" % self.__jobSite.workdir)

        # get the experiment object
        self.__thisExperiment = getExperiment(self.getExperiment())
        tolog("runEvent will serve experiment: %s" % (self.__thisExperiment.getExperiment()))

    def getHPCEventJobFromPanda(self):
        """ Placeholder, not implemented """

        pass

    def getHPCEventJobFromEnv(self):
        """
        Build the Job object from the newJobDef module, report the 'starting'
        state and run setup() to get the run command list.

        On success self.__job, self.__runCommandList and self.__multi_trf are
        set. Failures are reported through failJob().
        """

        tolog("getHPCEventJobFromEnv")

        # Create the job object outside the try block so that the error
        # handler below always has a job object to hand to failJob()
        # (previously 'job' could be unbound there)
        job = Job.Job()
        try:
            # always use this filename as the new jobDef module name
            import newJobDef

            job.setJobDef(newJobDef.job)
            job.coreCount = 0
            job.workdir = self.__jobSite.workdir
            job.experiment = self.getExperiment()
            # figure out and set payload file names
            job.setPayloadName(self.__thisExperiment.getPayloadName(job))
            # reset the default job output file list which is anyway not correct
            job.outFiles = []
        except Exception as e:
            pilotErrorDiag = "Failed to process job info: %s" % str(e)
            tolog("!!WARNING!!3000!! %s" % (pilotErrorDiag))
            self.failJob(0, PilotErrors.ERR_UNKNOWN, job, pilotErrorDiag=pilotErrorDiag)

        self.__job = job

        # prepare for the output file data directory
        # (will only be created for jobs that end up in a 'holding' state)
        self.__job.datadir = self.getParentWorkDir() + "/PandaJob_%s_data" % (job.jobId)

        # See if it's an analysis job or not
        trf = self.__job.trf
        self.__analysisJob = isAnalysisJob(trf.split(",")[0])

        # Setup starts here ....................................................

        # Update the job state file
        self.__job.jobState = "starting"
        self.__job.setHpcStatus('init')

        # Send [especially] the process group back to the pilot
        self.__job.setState([self.__job.jobState, 0, 0])
        self.__job.jobState = self.__job.result
        # Use self for the port as well (the original used the module level
        # 'runJob' global here while taking the server from self)
        rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort())

        self.__JR = JobRecovery(pshttpurl='', pilot_initdir=self.__job.workdir)
        self.__JR.updateJobStateTest(self.__job, self.__jobSite, self.__node, mode="test")
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node, 25443)

        # prepare the setup and get the run command list
        ec, runCommandList, job, multi_trf = self.setup(
            self.__job, self.__jobSite, self.__thisExperiment)
        if ec != 0:
            tolog("!!WARNING!!2999!! runJob setup failed: %s" % (job.pilotErrorDiag))
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
        tolog("Setup has finished successfully")
        self.__job = job
        self.__runCommandList = runCommandList
        self.__multi_trf = multi_trf

        # job has been updated, display it again
        self.__job.displayJob()
        tolog("RunCommandList: %s" % self.__runCommandList)
        tolog("Multi_trf: %s" % self.__multi_trf)
def getExpSpecificMetadata(self, job, workdir):
    """
    Return metadata extracted from jobReport.json.

    job     : not used
    workdir : directory expected to contain jobReport.json

    Returns the full text of the file, or an empty string when the file is
    missing or cannot be read (the problem is logged, not raised).
    """

    fwjrMetadata = ''
    fwjrFile = os.path.join(workdir, "jobReport.json")
    tolog("Looking for jobReport file")
    if os.path.exists(fwjrFile):
        tolog("Found jobReport: %s" % fwjrFile)
        try:
            # The context manager closes the handle (it used to be leaked)
            # and a single read replaces the line-by-line concatenation
            with open(fwjrFile, 'r') as f:
                fwjrMetadata = f.read()
        except Exception as e:
            # best effort: keep going with empty metadata
            tolog("Failed to open jobReport file: %s" % str(e))
    else:
        tolog("jobReport not found in %s " % fwjrFile)

    return fwjrMetadata


def handleTrfExitcode(self, job, res, error, filename):
    """
    Translate a transformation exit code into pilot error information.

    res[0] is taken as the transformation exit code. For the codes handled
    below, job.pilotErrorDiag and job.result[2] are updated in place.
    """
    # NOTE(review): only the part of this method visible in this chunk is
    # reproduced; it has no return statement and 'filename' is not used -
    # confirm against the complete source.

    transExitCode = res[0]
    #Mancinelli: TODO map CMS transformation error codes with error messages
    if transExitCode:
        # Handle PandaMover errors
        # Mancinelli: do we need this?
        if transExitCode == 176:
            job.pilotErrorDiag = "PandaMover staging error: File is not cached"
            job.result[2] = error.ERR_PANDAMOVERFILENOTCACHED
        elif transExitCode == 86:
            job.pilotErrorDiag = "PandaMover transfer failure"
            job.result[2] = error.ERR_PANDAMOVERTRANSFER
if status == 0: try: #self.updateEventRange(eventRangeID) self.__eventRanges[eventRangeID] = 'stagedOut' tolog("Remove staged out output file: %s" % output) os.remove(output) except Exception, e: tolog( "!!WARNING!!2233!! remove ouput file threw an exception: %s" % (e)) #self.__failedStageOuts.append(output_info) else: tolog("remove output file has returned") else: tolog( "!!WARNING!!1164!! Failed to upload file to objectstore: %d, %s" % (status, pilotErrorDiag)) self.__failedStageOuts.append(output_info) def getDefaultResources(self): siteInfo = getSiteInformation(self.getExperiment()) catchalls = siteInfo.readpar("catchall") values = {} for catchall in catchalls.split(","): if '=' in catchall: values[catchall.split('=')[0]] = catchall.split('=')[1] res = {} res['queue'] = values.get('queue', 'regular') res['mppwidth'] = values.get('mppwidth', 48) res['mppnppn'] = values.get('mppnppn', 1) res['walltime_m'] = values.get('walltime_m', 30)
def runHPCEvent(self):
    """
    Run the payload through the HPC manager.

    Prepares the HPC job, requests event ranges, submits the job and, while
    it is running, hands finished outputs to a stage-out thread pool. Failed
    stage-outs are retried twice with smaller pools. The job status is
    reported to the pilot and PanDA servers along the way.
    """

    # -- small local helpers for the calls that are repeated below --

    def tellPilot():
        RunJobUtilities.updatePilotServer(self.__job, self.getPilotServer(),
                                          self.getPilotPort())

    def tellPanda():
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node, 25443)

    def saveJobState():
        self.__JR.updateJobStateTest(self.__job, self.__jobSite, self.__node, mode="test")

    def queueStageOuts(pool, items):
        for item in items:
            pool.add_task(self.stageOutHPCEvent, item)

    def retryFailedStageOuts(pool):
        # take over the current failures; new failures are collected afresh
        toRetry = self.__failedStageOuts
        self.__failedStageOuts = []
        queueStageOuts(pool, toRetry)
        pool.wait_completion()
        self.updateHPCEventRanges()

    tolog("runHPCEvent")
    self.__job.jobState = "running"
    self.__job.setState([self.__job.jobState, 0, 0])
    self.__job.pilotErrorDiag = None
    tellPilot()
    saveJobState()

    defRes = self.getDefaultResources()
    self.__copyInputFiles = (defRes['copy_input_files'] == 'true')

    status, output, hpcJob = self.prepareHPCJob()
    if status != 0:
        tolog("failed to create the Tag file")
        self.failJob(0, PilotErrors.ERR_UNKNOWN, self.__job, pilotErrorDiag=output)
        return
    tolog("HPC Job: %s " % hpcJob)

    self.__hpcStatus = None
    self.__hpcLog = None

    logFileName = None
    tolog("runJobHPCEvent.getPilotLogFilename=%s" % self.getPilotLogFilename())
    if self.getPilotLogFilename() != "":
        logFileName = self.getPilotLogFilename()
    manager = HPCManager(globalWorkingDir=self.__job.workdir,
                         logFileName=logFileName,
                         poolFileCatalog=self.__poolFileCatalogTemp,
                         inputFiles=self.__inputFilesGlobal,
                         copyInputFiles=self.__copyInputFiles)

    self.__hpcManager = manager
    self.HPCMode = "HPC_" + manager.getMode(defRes)
    self.__job.setMode(self.HPCMode)
    self.__job.setHpcStatus('waitingResource')
    tellPilot()
    tellPanda()

    manager.getFreeResources(defRes)
    self.__job.coreCount = manager.getCoreCount()
    self.__job.setHpcStatus('gettingEvents')
    tellPilot()
    tellPanda()

    # Ask for as many event ranges as the manager wants, capped by max_events
    numRanges = manager.getEventsNumber()
    tolog("HPC Manager needs events: %s, max_events: %s; use the smallest one." % (numRanges, defRes['max_events']))
    if numRanges > int(defRes['max_events']):
        numRanges = int(defRes['max_events'])
    eventRanges = self.getEventRanges(numRanges=numRanges)
    if len(eventRanges) == 0:
        tolog("Get no Event ranges. return")
        return
    for eventRange in eventRanges:
        self.__eventRanges[eventRange['eventRangeID']] = 'new'

    # setup stage out
    self.setupStageOutHPCEvent()

    manager.initJob(hpcJob)
    manager.initEventRanges(eventRanges)
    manager.submit()

    stageOutPool = ThreadPool(defRes['stageout_threads'])

    # Poll until the manager is done; report on a state change or at least
    # every ten minutes
    lastState = None
    lastReport = time.time()
    while not manager.isFinished():
        state = manager.poll()
        self.__job.setHpcStatus(state)
        if lastState is None or lastState != state or time.time() > (lastReport + 60 * 10):
            lastState = state
            lastReport = time.time()
            tolog("HPCManager Job stat: %s" % state)
            saveJobState()
            tellPilot()
            tellPanda()

        if state and state == 'Complete':
            break

        queueStageOuts(stageOutPool, manager.getOutputs())

        time.sleep(30)
        self.updateHPCEventRanges()

    tolog("HPCManager Job Finished")
    self.__job.setHpcStatus('stagingOut')
    tellPilot()
    tellPanda()

    # Stage out whatever is left
    queueStageOuts(stageOutPool, manager.getOutputs())

    self.updateHPCEventRanges()
    stageOutPool.wait_completion()
    self.updateHPCEventRanges()

    # First retry with half of the threads (at least one)
    if len(self.__failedStageOuts) > 0:
        tolog("HPC Stage out retry 1")
        reducedThreads = defRes['stageout_threads'] / 2
        if reducedThreads < 1:
            reducedThreads = 1
        retryFailedStageOuts(ThreadPool(reducedThreads))

    # Second retry with a single thread
    if len(self.__failedStageOuts) > 0:
        tolog("HPC Stage out retry 2")
        retryFailedStageOuts(ThreadPool(1))

    self.__job.setHpcStatus('finished')
    tellPanda()
    self.__hpcStatus, self.__hpcLog = manager.checkHPCJobLog()
    tolog("HPC job log status: %s, job log error: %s" % (self.__hpcStatus, self.__hpcLog))
def setup(self, job, jobSite, thisExperiment):
    """
    Prepare the setup of an ARGO job.

    Parses the first line of job.jobPars with optparse, stores the parsed
    options as attributes on self, creates the ARGO job object through
    self.get_argo_job(), changes directory to jobSite.workdir and records the
    elapsed setup time in job.timeSetup.

    Returns the tuple (ec, job); ec is 0 on success and
    self.__error.ERR_SETUPFAILURE when the job parameters cannot be parsed.
    thisExperiment is not used.
    """
    # NOTE(review): three statements below are corrupted in this copy of the
    # source (tokens are missing, two of them are not valid Python). They are
    # left untouched and marked individually - restore them from the upstream
    # file before use.

    # start setup time counter
    t0 = time.time()
    ec = 0

    # split up the job parameters to be able to loop over the tasks
    jobParameters = job.jobPars.split("\n")[0]
    jobTrf = job.trf.split("\n")[0]  # not used below

    parser = optparse.OptionParser(
        description=' program to submit alpgen jobs like a pilot')
    parser.add_option('-p', '--process', dest='process',
                      help='Alpgen Process, i.e. zjet, wjet, wqq, etc.')
    parser.add_option(
        '-n', '--nevts', dest='nevts',
        help='Number of weighted events requested in input file for weighted event generation',
        type='int')
    parser.add_option(
        '-g', '--group-id', dest='group_identifier',
        help='User specified string that helps the user group jobs together.')
    parser.add_option('-e', '--ecm', dest='ecm', help='Center of Mass Energy.')
    parser.add_option('-r', '--run-number', dest='run_number', help='Run Number')
    parser.add_option(
        '-c', '--jobConfig', dest='jobConfig',
        help='Job Options that will used from the Job Config tarball, i.e. MC12JobOptions/MC12.<Run Number>.<description>.py')
    parser.add_option(
        '-j', '--evgenJobOpts', dest='evgenJobOpts',
        help='Job Config tarball, i.e. MC12JobOpts-XX-YY-ZZ.tar.gz')
    parser.add_option('', '--dev', dest='dev', help='For development only.',
                      action='store_true', default=False)
    parser.add_option(
        '-q', '--status-queue', dest='enable_status_queue',
        help='Enable the setting of the message queue parameter in the ArgoJob, which means ARGO will not send message updates for this job to the queue with its job ID.',
        action='store_true', default=False)
    #parser.add_option('-a','--warmup-evts',dest='warmup_evts',help='For Warmup Step: Three numbers seperated by commas giving the number of events per iteration, number of iterations, and final number of events to generate. Example: "10000,10,1000000"')
    parser.add_option(
        '-b', '--evtgen-evts', dest='evtgen_evts',
        help='For Event Generation Step: The number of events to generation in the event generation step. The ouput of unweighted events tends to be less so request more than you want. For example W+0jets gives you 70\%, W+1jet gives you 16%, W+2jet gives you 5%, W+3jet gives you 1%, and so on.',
        type='int')
    parser.add_option('-o', '--num-nodes', dest='numnodes',
                      help='number of nodes to use on destination machine',
                      type='int')
    parser.add_option(
        '-u', '--ranks-per-node', dest='ranks_per_node',
        help='number of MPI ranks per node to use on destination machine',
        type='int')
    parser.add_option(
        '-t', '--wall-time', dest='walltime',
        help='The wall time to submit to the queue in minutes.',
        type='int')
    parser.add_option(
        '-s', '--site', dest='site',
        help='Balsam site name on which to run the event generation')
    parser.add_option(
        '-x', '--no-submit', dest='submit',
        help='do not submit the message to ARGO. For testing purposes.',
        action='store_false', default=True)
    parser.add_option(
        '', '--wmp-evts-itr', dest='wm_evts_per_itr',
        help='Warmup: Number of weighted events per interation.')
    parser.add_option('', '--wmp-nitr', dest='wm_nitr',
                      help='Warmup: Number of iterations')
    parser.add_option('', '--wmp-evts', dest='wm_evts',
                      help='Warmup: Number of final events to produce.')

    # optparse reports bad arguments by calling sys.exit(); the bare except
    # also catches the resulting SystemExit so that a pilot error code is
    # returned instead
    try:
        options, args = parser.parse_args(shlex.split(jobParameters))
    except:
        ec = self.__error.ERR_SETUPFAILURE
        job.pilotErrorDiag = "Failure to parse job arguments"
        tolog("Failure to parse job arguments for ARGO job")
        return ec, job

    tolog("ARGO job will be launched with next parameters: %s" % jobParameters)

    # copy the parsed options to the instance
    self.process = options.process
    # NOTE(review): the format string looks masked in this copy; as written it
    # has no conversion specifier for the value that is applied to it - verify
    self.username = '******' % job.prodUserID[:120]  #os.environ['USER']
    self.group_identifier = options.group_identifier
    self.ecm = options.ecm
    self.run_number = options.run_number
    self.job_config = options.jobConfig
    self.evgen_job_opts = options.evgenJobOpts
    self.warmup_phase0_number_events = options.wm_evts_per_itr
    self.warmup_phase0_number_iterations = options.wm_nitr
    self.warmup_phase1_number_events = options.wm_evts
    self.evtgen_phase1_number_events = options.evtgen_evts
    self.evtgen_nodes = options.numnodes
    self.evtgen_processes_per_node = options.ranks_per_node
    self.evtgen_wall_minutes = options.walltime
    # NOTE(review): corrupted line, not valid Python. Presumably this was
    # "self.parallel_site = options.site", an assignment of "options.dev" and
    # a separate "self.job_path = ..." statement - TODO confirm
    self.parallel_site = = self.job_path = os.path.join(self.job_working_path, job.jobId)
    tolog("ARGO job path: %s" % self.job_path)
    self.argo_job = self.get_argo_job(job)
    # NOTE(review): corrupted line, the condition of this 'if' is missing
    # (presumably the --dev option selects the development cluster) - TODO
    # confirm
    if job.serial_site = 'argo_cluster_dev'

    # verify that the multi-trf job is setup properly
    os.chdir(jobSite.workdir)
    tolog("Current job workdir is %s" % os.getcwd())

    job.timeSetup = int(time.time() - t0)
    tolog("Total setup time: %d s" % (job.timeSetup))

    return ec, job
def prepareHPCJob(self):
    """
    Prepare everything the HPC manager needs to run the payload.

    Steps: collect the input files, create a TAG file per event file, write
    the two pool file catalogs, derive the setup command, build the AthenaMP
    command (stored back in self.__runCommandList[0]) and the TokenExtractor
    file list and command.

    Returns the tuple (status, error message, commands):
      (0, None, {"TokenExtractCmd": ..., "AthenaMPCmd": ...}) on success,
      (-1, "Failed to create the TAG file", None) when no TAG file could be
      created or found for an event file.

    Reads self.__copyInputFiles, which must have been set by the caller.
    """

    # 1. input files: global paths (job work dir) and paths relative to the
    #    HPC working dir placeholder
    inputFilesGlobal = [os.path.join(self.__job.workdir, inputFile)
                        for inputFile in self.__job.inFiles]
    inputFiles = [os.path.join('HPCWORKINGDIR', inputFile)
                  for inputFile in self.__job.inFiles]
    inputFileDict = dict(zip(self.__job.inFilesGuids, inputFilesGlobal))
    self.__inputFilesGlobal = inputFilesGlobal

    tagFiles = {}
    EventFiles = {}
    for guid in inputFileDict:
        if '.TAG.' in inputFileDict[guid]:
            tagFiles[guid] = inputFileDict[guid]
        elif "DBRelease" not in inputFileDict[guid]:
            EventFiles[guid] = {'file': inputFileDict[guid]}

    # 2. create TAG file
    for guid in EventFiles:
        inFiles = [EventFiles[guid]['file']]
        input_tag_file, input_tag_file_guid = self.createTAGFile(
            self.__runCommandList[0], self.__job.trf, inFiles, "")
        if input_tag_file != "" and input_tag_file_guid != "":
            tolog("Will run TokenExtractor on file %s" % (input_tag_file))
            EventFiles[guid]['TAG'] = input_tag_file
            EventFiles[guid]['TAG_guid'] = input_tag_file_guid
        elif len(tagFiles) > 0:
            # only for current test: fall back to the first TAG input file
            fallback_guid = next(iter(tagFiles))
            EventFiles[guid]['TAG_guid'] = fallback_guid
            EventFiles[guid]['TAG'] = tagFiles[fallback_guid]
        else:
            return -1, "Failed to create the TAG file", None

    # 3. create Pool File Catalog (global paths and HPC working dir paths)
    self.__poolFileCatalog = os.path.join(self.__job.workdir, "PoolFileCatalog_HPC.xml")
    createPoolFileCatalog(inputFileDict, self.__job.inFiles, self.__poolFileCatalog)
    inputFileDictTemp = dict(zip(self.__job.inFilesGuids, inputFiles))
    self.__poolFileCatalogTemp = os.path.join(self.__job.workdir, "PoolFileCatalog_Temp.xml")
    self.__poolFileCatalogTempName = "HPCWORKINGDIR/PoolFileCatalog_Temp.xml"
    createPoolFileCatalog(inputFileDictTemp, self.__job.inFiles, self.__poolFileCatalogTemp)

    # 4. getSetupCommand
    # (a corrupted, syntactically invalid statement pair that extracted the
    # "source ...;" part of the setup command was removed here; its results
    # were only referenced from commented-out code)
    setupCommand = self.stripSetupCommand(self.__runCommandList[0], self.__job.trf)
    tolog("setup command: " + setupCommand)

    # 5. AthenaMP command
    preExec = ' --preExec \'from G4AtlasApps.SimFlags import simFlags;simFlags.RunNumber=222222;from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.Strategy="TokenScatterer";from AthenaCommon.AppMgr import ServiceMgr as svcMgr;from AthenaServices.AthenaServicesConf import OutputStreamSequencerSvc;outputStreamSequencerSvc = OutputStreamSequencerSvc();outputStreamSequencerSvc.SequenceIncidentName = "NextEventRange";outputStreamSequencerSvc.IgnoreInputFileBoundary = True;svcMgr += outputStreamSequencerSvc\' '
    postExec = " '--skipFileValidation' '--checkEventCount=False' '--postExec' 'svcMgr.PoolSvc.ReadCatalog += [\"xmlcatalog_file:%s\"]'"

    athenaMPCmd = self.__runCommandList[0]
    if not self.__copyInputFiles:
        # The input files are read in place: point every --input* argument to
        # the full list of global paths and use the global catalog
        jobInputFileList = ','.join(inputFilesGlobal)
        command_list_new = []
        for command_part in athenaMPCmd.split(" "):
            if command_part.startswith("--input"):
                command_arg = command_part.split("=")[0]
                command_part = command_arg + "=" + jobInputFileList
            command_list_new.append(command_part)
        athenaMPCmd = " ".join(command_list_new)
        catalog = self.__poolFileCatalog
    else:
        # The input files are copied: use the catalog with HPC working dir paths
        catalog = self.__poolFileCatalogTempName
    athenaMPCmd += preExec
    athenaMPCmd += postExec % (catalog)

    # Options that should not be passed on in HPC.
    # NOTE(review): the 'RecJobTransforms/' literals look truncated in this
    # copy of the source (a file name seems to be missing); they are kept
    # exactly as found - verify against upstream.
    for unwanted, replacement in (("--DBRelease=current", ""),
                                  ('RecJobTransforms/,', ''),
                                  (',RecJobTransforms/', ''),
                                  (' --postInclude=RecJobTransforms/ ', ' ')):
        athenaMPCmd = athenaMPCmd.replace(unwanted, replacement)

    athenaMPCmd += " 1>athenaMP_stdout.txt 2>athenaMP_stderr.txt"
    athenaMPCmd = athenaMPCmd.replace(";;", ";")
    self.__runCommandList[0] = athenaMPCmd

    # 6. Token Extractor file list
    # in the token extractor file list, the guid is the Event guid, not the tag guid.
    self.__tagFile = os.path.join(self.__job.workdir, "TokenExtractor_filelist")
    with open(self.__tagFile, 'w') as handle:
        for guid in EventFiles:
            handle.write(guid + ",PFN:" + EventFiles[guid]['TAG'] + "\n")

    # 7. Token Extractor command
    self.__tokenExtractorCmd = setupCommand + ";" + " TokenExtractor -v --source " + self.__tagFile + " 1>tokenExtract_stdout.txt 2>tokenExtract_stderr.txt"
    self.__tokenExtractorCmd = self.__tokenExtractorCmd.replace(";;", ";")

    return 0, None, {
        "TokenExtractCmd": self.__tokenExtractorCmd,
        "AthenaMPCmd": self.__runCommandList[0]
    }
def updateRunCommandList(runCommandList, pworkdir, jobId, statusPFCTurl, analysisJob, usedFAXandDirectIO, hasInput, prodDBlockToken):
    """
    Update the run command list if --directIn is no longer needed.

    The file state dictionary decides what is removed: with only
    copy_to_scratch transfers (or a 'local' prodDBlockToken) the direct
    access options are dropped. When FAX with direct I/O or the new movers
    were used, the LFC and prefix options are removed and --usePFCTurl is
    added. Returns the updated list of run commands.
    """

    def dropPrefixOptions(command):
        # remove "--oldPrefix <value>" and "--newPrefix <value>"
        for option, pattern in (("--oldPrefix", r"(\-\-oldPrefix\ \S+)"),
                                ("--newPrefix", r"(\-\-newPrefix\ \S+)")):
            if option in command:
                command = removePattern(command, pattern)
                tolog("(Removed %s pattern)" % option)
        return command

    def ensurePFCTurl(command):
        # add the --usePFCTurl if not there already (analysis jobs only)
        if "--usePFCTurl" not in command and analysisJob:
            command += " --usePFCTurl"
            tolog("(Added --usePFCTurl)")
        return command

    # the method is using the file state dictionary (dump to be removed later)
    dumpFileStates(pworkdir, jobId, ftype="input")

    # remove any instruction regarding tag file creation for event service jobs
    cleaned = []
    for command in runCommandList:
        if "--createTAGFileForES" in command:
            command = command.replace("--createTAGFileForES", "")
        cleaned.append(command)
    runCommandList = cleaned

    # no need to continue if no input files
    if not hasInput:
        return runCommandList

    # are there only copy_to_scratch transfer modes in the file state
    # dictionary? if so, remove any lingering --directIn instruction
    only_copy_to_scratch = hasOnlyCopyToScratch(pworkdir, jobId)
    if not (only_copy_to_scratch or 'local' in prodDBlockToken):
        tolog("Nothing to update in run command list related to copy-to-scratch")
        updated = runCommandList
    else:
        if only_copy_to_scratch:
            tolog("There are only copy_to_scratch transfer modes in file state dictionary")
        updated = []
        for command in runCommandList:
            # remove the direct access / file stager options if present
            for option in ("--directIn", "--useFileStager"):
                if option in command:
                    tolog("(Removing %s instruction from run command since it is not needed)" % option)
                    command = command.replace(option, "")

            # remove additional run options if creation of TURL based PFC
            # failed (statusPFCTurl can also be None, hence the explicit
            # comparison with False)
            if statusPFCTurl == False:
                if "--usePFCTurl" in command:
                    tolog("(Removing --usePFCTurl instruction from run command since it is not needed)")
                    command = command.replace(" --usePFCTurl", "")
                if analysisJob and "--lfcHost" not in command:
                    tolog("Adding lfcHost to run command")
                    command += ' --lfcHost %s' % (readpar('lfchost'))

            tolog("Updated run command: %s" % (command))
            updated.append(command)

    # was FAX used as primary site mover in combination with direct I/O?
    if usedFAXandDirectIO == True:
        tolog("Since FAX was used as primary site mover in combination with direct I/O, the run command list need to be updated")
        faxUpdated = []
        for command in updated:
            # remove the --lfcHost
            if "--lfcHost" in command:
                lfcHostOption = ' --lfcHost %s' % (readpar('lfchost'))
                command = command.replace(lfcHostOption, '')
                tolog("(Removed the LFC host:%s)" % (lfcHostOption))

            command = dropPrefixOptions(command)
            command = ensurePFCTurl(command)

            tolog("Updated run command: %s" % (command))
            faxUpdated.append(command)
        updated = faxUpdated

    ### new movers quick integration: reuse usedFAXandDirectIO variable with
    ### special meaning to avoid any LFC and prefixes lookups in
    ### transformation scripts since new movers already form proper pfn
    ### values; proper workflow is required: to be reimplemented later
    if usedFAXandDirectIO in ('newmover', 'newmover-directaccess'):
        tolog("updateRunCommandList(): use new movers logic")
        tolog("updateRunCommandList(): remove to be deprecated options (--lfcHost, --oldPrefix, --newPrefix) from command list")
        tolog("updateRunCommandList(): force to set --usePFCTurl")
        tolog("updateRunCommandList(): check directaccess mode if need (--directIn)")
        tolog("current runCommandList=%s" % updated)

        moverUpdated = []
        for command in updated:
            if "--lfcHost" in command:
                command = removePattern(command, r"(\-\-lfcHost\ \S+)")
                tolog("(Removed the --lfcHost)")

            command = dropPrefixOptions(command)
            command = ensurePFCTurl(command)

            # add --directIn if needed
            if usedFAXandDirectIO == 'newmover-directaccess':
                if "--directIn" not in command and analysisJob:
                    command += " --directIn"
                    tolog("(Added --directIn)")

            tolog("Updated run command: %s" % command)
            moverUpdated.append(command)
        updated = moverUpdated

    tolog("Dumping final input file states")
    dumpFileStates(pworkdir, jobId, ftype="input")

    return updated
def executePayload(self, thisExperiment, job):
    """
    Submit the ARGO job through the message queue and wait for its final state.

    Configures a MessageInterface (port, SSL certificate/key/CA files),
    creates a status queue named after the job id, sends the serialized ARGO
    job and then polls the status queue until the job reaches the HISTORY or
    FAILED state. Any exception is logged and reported through failJob().
    thisExperiment is not used in the visible part.
    """
    # NOTE(review): this copy of the method is cut off after the except
    # clause and two statements are corrupted (marked below); the nesting was
    # reconstructed from a whitespace-collapsed source - confirm against the
    # upstream file.

    t0 = os.times()
    res_tuple = None

    # loop over all run commands (only >1 for multi-trfs)
    getstatusoutput_was_interrupted = False
    job_status = None
    tolog("About to launch ARGO job")
    # Poll MQ for Job Status
    try:
        # Initiate MQ interface and send job
        self.argo_job.job_status_routing_key = '%s_job_status' % job.jobId  #'status_' + jobID
        si = SiteInformation()
        mi = MessageInterface()
        # NOTE(review): corrupted line, the assignment target is missing
        # (presumably the MQ host setting) - TODO restore
        = ''
        mi.port = 5671

        # Certificate: the value set first is overwritten by the file next to
        # the proxy certificate, which X509_USER_CERT overrides when set
        mi.ssl_cert = si.getSSLCertificate( )  #'/grid/atlas/hpc/pilot_certs/xrootdsrv-cert.pem'
        proxy_cert_path = si.getSSLCertificate()
        mi.ssl_cert = os.path.dirname(proxy_cert_path) + "/rabbitmq-cert.pem"
        if 'X509_USER_CERT' in os.environ.keys():
            mi.ssl_cert = os.environ['X509_USER_CERT']  #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-cert.pem'

        # Key: same scheme, X509_USER_KEY has the last word
        mi.ssl_key = mi.ssl_cert  #'/grid/atlas/hpc/pilot_certs/xrootdsrv-key.pem'
        mi.ssl_key = os.path.dirname(proxy_cert_path) + "/rabbitmq-key.pem"
        if 'X509_USER_KEY' in os.environ.keys():
            mi.ssl_key = os.environ['X509_USER_KEY']  #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-key.pem'

        # CA certificates: default path, replaced by the file next to the
        # proxy certificate when that file exists
        #mi.ssl_ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
        mi.ssl_ca_certs = '/grid/atlas/hpc/pilot_certs/cacerts.pem'
        #if 'X509_CA_CERTS' in os.environ.keys():
        #    mi.ssl_ca_certs = os.environ['X509_CA_CERTS'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/cacerts.pem'
        #tolog("CA certs: %s" % (mi.ssl_ca_certs))
        ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
        if os.path.isfile(ca_certs):
            mi.ssl_ca_certs = ca_certs

        mi.exchange_name = 'argo_users'

        #Create queue to get messages about ARGO Job status from MQ
        tolog('Opening connection with MQ')
        mi.open_blocking_connection()
        tolog('Create queue [%s] to retrieve messages with job status' % self.argo_job.job_status_routing_key)

        mi.create_queue(self.argo_job.job_status_routing_key, self.argo_job.job_status_routing_key)

        # submit ARGO job to MQ

        #tolog('Opening connection with MQ')
        #mi.open_blocking_connection()
        routing_key = 'argo_job'
        # NOTE(review): corrupted line, the condition of this 'if' is missing
        # (presumably a development-mode flag selects the dev routing key) -
        # TODO restore
        if routing_key = 'argo_job_dev'
        tolog('Sending msg with job to ARGO')
        mi.send_msg(self.argo_job.serialize(), routing_key)
        tolog(' done sending ')

        # Waiting till job done or failed
        ARGO_err_msg = ''
        while True:
            time.sleep(5)
            message = mi.receive_msg(self.argo_job.job_status_routing_key, True)
            if message[2]:
                tolog("Got message from queue [%s]: method [%s], properties [%s], body [ %s ]" % (self.argo_job.job_status_routing_key, message[0], message[1], message[2]))
                job_status = ArgoJobStatus.get_from_message(message[2])
                job.hpcStatus = job_status.state
                rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort())

                tolog("Extracted state: %s" % job_status.state)
                if job_status.state == job_status.HISTORY:
                    res_tuple = (0, "Done")
                    break
                elif job_status.is_failed():
                    # the error message is collected, the loop keeps polling
                    res_tuple = (1, "Failed")
                    ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                elif job_status.state == job_status.FAILED:
                    res_tuple = (1, "Failed")
                    ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                    runJob.failJob(1, 0, job, ins=job.inFiles, pilotErrorDiag=ARGO_err_msg)
                    break
            # NOTE(review): placement of this second sleep at loop level is
            # inferred - confirm
            time.sleep(5)

        mi.close()
        tolog(' closing connection to MQ')

        tolog("Job State: %s" % (job_status.state))
        #job.timeExe = int(fork_job.finished - fork_job.started)

        ####################################################

    except Exception, e:
        tolog("!!FAILED!!3000!! Failed to run command %s" % str(e))
        getstatusoutput_was_interrupted = True
        res_tuple = (1, "Failed")
        self.failJob(0, self.__error.ERR_GENERALERROR, job, pilotErrorDiag=str(e))
def removeSkippedFromJobPars(fname, jobPars):
    """
    Remove skipped input files from jobPars.

    The skipped LFNs are read from the XML file `fname` via
    getLFNsFromSkippedXML(). For every LFN, the first matching textual
    variant (tried from most to least specific) is removed from jobPars:

        'lfn',    quoted, with trailing comma
        'lfn'     quoted
        lfn,      bare, with trailing comma
        lfn       bare

    :param fname: path to the XML file listing the skipped input files.
    :param jobPars: job parameter string to clean up.
    :return: jobPars with all occurrences of the matched variant of each
             skipped LFN removed (unchanged if nothing was skipped or found).
    """

    # get the skipped file names from the xml
    skipped = getLFNsFromSkippedXML(fname)
    if not skipped:
        tolog("Did not find any skipped LFNs in: %s" % (fname))
        return jobPars

    tolog("Removing skipped input files from jobPars")
    tolog("..skipped: %s" % str(skipped))
    tolog("..jobPars:\n%s" % (jobPars))

    for skip in skipped:
        # an empty name would match "," below and strip every comma from jobPars
        if not skip:
            continue

        tolog("..Removing: %s" % (skip))

        # try different styles, most specific first, so that quotes and the
        # separating comma are removed together with the file name
        candidates = ("'%s'," % (skip), "'%s'" % (skip), "%s," % (skip), skip)
        for candidate in candidates:
            if candidate in jobPars:
                # remove the full matched variant (including any trailing comma)
                jobPars = jobPars.replace(candidate, '')
                tolog('..Removed %s from jobPars' % (candidate))
                break
        else:
            # nothing to remove
            tolog("..Found nothing to remove from jobPars: %s" % (jobPars))

    return jobPars
tolog("Job State: %s" % (job_status.state)) #job.timeExe = int(fork_job.finished - fork_job.started) #################################################### except Exception, e: tolog("!!FAILED!!3000!! Failed to run command %s" % str(e)) getstatusoutput_was_interrupted = True res_tuple = (1, "Failed") self.failJob(0, self.__error.ERR_GENERALERROR, job, pilotErrorDiag=str(e)) else: if res_tuple[0] == 0: tolog("ARGO Job finished") else: tolog("ARGO Job failed: res = %s" % (str(res_tuple))) t1 = os.times() # CPU consumption metrics # t = map(lambda x, y:x-y, t1, t0) # get the time consumed # job.cpuConsumptionUnit, job.cpuConsumptionTime, job.cpuConversionFactor = pUtil.setTimeConsumed(t) # tolog("Job CPU usage: %s %s" % (job.cpuConsumptionTime, job.cpuConsumptionUnit)) # tolog("Job CPU conversion factor: %1.10f" % (job.cpuConversionFactor)) job.timeExe = int(round(t1[4] - t0[4])) tolog("Original exit code: %s" % (res_tuple[0])) if res_tuple[0] != None: tolog("Exit code: %s (returned from OS)" % (res_tuple[0] % 255)) res0 = res_tuple[0] % 255