def ReadQueueParams(): queuedata = "" verified = False si = SiteInformation() qdfname = si.getQueuedataFileName(check=False) # qdfname = "%s/queuedata.dat" % (os.environ['PilotHomeDir']) # qdfname = "%s/queuedata.json" % (os.environ['PilotHomeDir']) if os.path.exists(qdfname): print "queuedata already downloaded" verified, queuedata = readqueuedata(qdfname, first_trial=True) else: print "Downloading queuedata in atlasProdPilot" # download queuedata and verify it extension = utils.getExtension(alternative='dat') if extension == "json": _ext = extension else: _ext = "pilot" curl_url = 'http://pandaserver.cern.ch:25085/cache/schedconfig/%s.pilot.%s' % (qname, _ext) si = SiteInformation() sslCertificate = si.getSSLCertificate() cmd = 'curl --connect-timeout 20 --max-time 120 --cacert %s -sS "%s" > %s' % (sslCertificate, curl_url, qdfname) print "Executing command: %s" % (cmd) try: ret, rs = commands.getstatusoutput(cmd) except Exception, e: print "!!WARNING!!1999!! Failed with curl command: %s" % str(e) else:
def fixStageInPath(self, path):
    """Rewrite a stage-in path so that it can be handed to the copy tool.

    For paths starting with "srm" that do not carry an '?SFN=' part, the
    'srm://<hostname>' prefix is replaced either with the matching entry
    returned by getSEMatchFromSEOpt() or, failing that, with the first
    storage element listed in the 'se' queuedata field. The result is
    finally passed through SiteInformation.getCopyPrefixPath().

    :param path: original file path (string).
    :return: the converted path (string).
    """

    if path[:3] == "srm" and '?SFN=' in path:
        self.log("Found SFN part in file path: %s" % (path))
    elif path[:3] == "srm":
        try:
            hostname = path.split('/', 3)[2]
        except Exception as e:
            # The format arguments used to be part of the string literal itself,
            # so the message was logged without the path and the error text
            self.log("!!WARNING!!2999!! Could not extract srm protocol for replacement, "
                     "keeping path variable as it is: %s (%s)" % (path, str(e)))
        else:
            # srm = 'srm://head01.aglt2.org'
            srm = 'srm://' + hostname

            # does seopt contain any matching srm's?
            sematch = self.getSEMatchFromSEOpt(srm)
            if sematch != "":
                path = path.replace(srm, sematch)
                self.log("Replaced %s with %s (from seopt) in path: %s" % (srm, sematch, path))
            else:
                se = readpar('se').split(",")[0]
                _dummytoken, se = self.extractSE(se)
                tolog("Using SE: %s" % (se))

                path = path.replace(srm, se)
                self.log("Replaced %s with %s (from se) in path: %s" % (srm, se, path))

                # add port number from se to getfile if necessary
                # ('se' only exists in this branch, so the port can only be added here)
                path = self.addPortToPath(se, path)

    siteInformation = SiteInformation()
    path = siteInformation.getCopyPrefixPath(path, stageIn=True)

    return path
def getStageInMode(self, lfn, prodDBlockToken):
    """Decide whether the input file is copied or accessed directly.

    :param lfn: logical file name; checked with isRootFileName().
    :param prodDBlockToken: token forwarded to getDirectInAccessMode().
    :return: (status, output) where status is PilotErrors.ERR_DIRECTIOFILE
             when direct access is selected and 0 otherwise; output is a
             dictionary with the keys 'errorLog', 'report' and 'transfer_mode'.
    """

    # should the root file be copied or read directly by athena?
    output = {
        "errorLog": None,
        "report": {"clientState": None},
        "transfer_mode": None,
    }

    is_root_file = self.isRootFileName(lfn)
    direct_in, mode = SiteInformation().getDirectInAccessMode(prodDBlockToken, is_root_file)

    if mode:
        #updateFileState(lfn, workDir, jobId, mode="transfer_mode", state=transfer_mode, type="input")
        output["transfer_mode"] = mode

    if not direct_in:
        return 0, output

    # direct access: no copy will take place, so there is no transfer timing to report
    output["report"].update({
        "clientState": 'FOUND_ROOT',
        'relativeStart': None,
        'transferStart': None,
    })
    return PilotErrors.ERR_DIRECTIOFILE, output
def fixStageOutPath(self, path):
    """Convert a stage-out PFN into a TURL by applying the copyprefix.

    :param path: the PFN to convert (string).
    :return: (status, output) where status is 0 on success or
             PilotErrors.ERR_STAGEINFAILED when the converted path does not
             start with "root:"; output is a dictionary holding 'errorLog',
             'report' and the converted 'path'.
    """

    site_info = SiteInformation()
    result = {"errorLog": None, "report": {"clientState": None}}
    status = 0

    copytool = site_info.getCopyTool(stageIn=False)
    # NOTE(review): the message says "get command" although this is the stage-out helper - confirm wording
    tolog("Site mover will use get command: %s, %s" % (copytool))

    # figure out which copyprefix to use (use the PFN to figure out where the file is and then use the appropriate copyprefix)
    # e.g. copyprefix=srm://srm-eosatlas.cern.ch,srm://srm-atlas.cern.ch^root://eosatlas.cern.ch/,root://castoratlas-xrdssl/
    #      PFN=srm://srm-eosatlas.cern.ch/.. use copyprefix root://eosatlas.cern.ch/ to build the TURL src_loc_pfn
    # full example:
    # Using copyprefixin = srm://srm-eosatlas.cern.ch,srm://srm-atlas.cern.ch^root://eosatlas.cern.ch/,root://castoratlas-xrdssl/
    # PFN=srm://srm-eosatlas.cern.ch/eos/atlas/atlasdatadisk/rucio/mc12_8TeV/8d/c0/EVNT.01212395._000004.pool.root.1
    # TURL=root://eosatlas.cern.ch//eos/atlas/atlasdatadisk/rucio/mc12_8TeV/8d/c0/EVNT.01212395._000004.pool.root.1
    turl = site_info.getCopyPrefixPath(path, stageIn=False)

    converted = turl.startswith("root:")
    if not converted:
        message = "Failed to use copyprefix to convert the current path to local path."
        tolog("!!WARNING!!1777!! %s" % (message))
        result["errorLog"] = message
        result["report"]["clientState"] = 'PSTAGE_FAIL'
        # NOTE(review): a stage-in error code is reported from a stage-out method - confirm this is intended
        status = PilotErrors.ERR_STAGEINFAILED

    tolog("PFN=%s" % (path))
    tolog("TURL=%s" % (turl))

    result['path'] = turl
    return status, result
def getStageInMode(self, lfn, prodDBlockToken):
    """Select the stage-in mode (copy or direct access) for an input file.

    :param lfn: logical file name, tested with isRootFileName().
    :param prodDBlockToken: token passed on to getDirectInAccessMode().
    :return: tuple (status, output); status is 0 for a normal copy and
             PilotErrors.ERR_DIRECTIOFILE when the file is to be read
             directly. output carries 'errorLog', 'report' and 'transfer_mode'.
    """

    # should the root file be copied or read directly by athena?
    report = dict(clientState=None)
    output = dict(errorLog=None, report=report, transfer_mode=None)

    rootFile = self.isRootFileName(lfn)
    si = SiteInformation()
    directIn, transfer_mode = si.getDirectInAccessMode(prodDBlockToken, rootFile)

    if transfer_mode:
        #updateFileState(lfn, workDir, jobId, mode="transfer_mode", state=transfer_mode, type="input")
        output["transfer_mode"] = transfer_mode

    status = 0
    if directIn:
        # the file will be read in place, so no transfer timestamps are recorded
        report["clientState"] = 'FOUND_ROOT'
        report['relativeStart'] = None
        report['transferStart'] = None
        status = PilotErrors.ERR_DIRECTIOFILE

    return status, output
def fixStageOutPath(self, path):
    """Build the TURL for a stage-out PFN using the site copyprefix.

    :param path: PFN (string).
    :return: tuple (status, output). status is 0 unless the converted path
             does not begin with "root:", in which case it is
             PilotErrors.ERR_STAGEINFAILED. output holds 'errorLog',
             'report' and 'path' (the converted path).
    """

    outputRet = dict(errorLog=None, report=dict(clientState=None))

    si = SiteInformation()
    tolog("Site mover will use get command: %s, %s" % (si.getCopyTool(stageIn=False)))

    # figure out which copyprefix to use (use the PFN to figure out where the file is and then use the appropriate copyprefix)
    # e.g. copyprefix=srm://srm-eosatlas.cern.ch,srm://srm-atlas.cern.ch^root://eosatlas.cern.ch/,root://castoratlas-xrdssl/
    #      PFN=srm://srm-eosatlas.cern.ch/.. use copyprefix root://eosatlas.cern.ch/ to build the TURL src_loc_pfn
    # full example:
    # Using copyprefixin = srm://srm-eosatlas.cern.ch,srm://srm-atlas.cern.ch^root://eosatlas.cern.ch/,root://castoratlas-xrdssl/
    # PFN=srm://srm-eosatlas.cern.ch/eos/atlas/atlasdatadisk/rucio/mc12_8TeV/8d/c0/EVNT.01212395._000004.pool.root.1
    # TURL=root://eosatlas.cern.ch//eos/atlas/atlasdatadisk/rucio/mc12_8TeV/8d/c0/EVNT.01212395._000004.pool.root.1
    ret_path = si.getCopyPrefixPath(path, stageIn=False)

    if ret_path.startswith("root:"):
        statusRet = 0
    else:
        errorLog = "Failed to use copyprefix to convert the current path to local path."
        tolog("!!WARNING!!1777!! %s" % (errorLog))
        outputRet["errorLog"] = errorLog
        outputRet["report"]["clientState"] = 'PSTAGE_FAIL'
        statusRet = PilotErrors.ERR_STAGEINFAILED

    # always log both forms and hand the converted path back, even on failure
    for label, value in (("PFN", path), ("TURL", ret_path)):
        tolog("%s=%s" % (label, value))
    outputRet['path'] = ret_path

    return statusRet, outputRet
def updateCopysetups(cmd3, transferType=None, useCT=None, directIn=None, useFileStager=None):
    """ Update the relevant copysetup field for remote I/O or file stager

    The 'copysetupin' field is updated when it is non-empty; otherwise the
    'copysetup' field is updated. All keyword options are forwarded
    unchanged to SiteInformation.updateCopysetup().
    """

    site_info = SiteInformation()
    setup_in = readpar('copysetupin')
    setup_default = readpar('copysetup')

    # copysetupin takes precedence whenever it has a value
    if setup_in != "":
        field, value = 'copysetupin', setup_in
    else:
        field, value = 'copysetup', setup_default

    site_info.updateCopysetup(cmd3, field, value,
                              transferType=transferType,
                              useCT=useCT,
                              directIn=directIn,
                              useFileStager=useFileStager)
def VerifyQueuedata(verified): """verify the information received from the server throught the queuedata info """ if not verified: print "Did not download queuedata from server" else: # is the site online? si = SiteInformation() status = si.readpar('status') if status.upper() == "OFFLINE": print "Site is %s, ignore" % (status.upper()) # sys.exit(2) else: print "Site is", status
def downloadscript(scripturl, sfile): """download the script with curl """ verified = False si = SiteInformation() sslCertificate = si.getSSLCertificate() cmd = "curl --connect-timeout 20 --max-time 120 --cacert %s %s -s -S -o %s" % (sslCertificate, scripturl, sfile) print cmd max_trials = 2 for trial in range(1, max_trials+1): st, out = commands.getstatusoutput(cmd) print "%s: %s" % (st, out) if st != 0: print "Error retrieving script with curl (attempt %d/%d)" % (trial, max_trials) else: cmd = "chmod +x %s; /bin/ls -al %s" % ( sfile, sfile ) print cmd stt, out = commands.getstatusoutput(cmd) print "%s: %s" % (stt, out) if stt == 0: if sfile.find('.tar.gz') > 0: print "Untarring", sfile cmd = "tar xzf %s" % sfile print cmd ec, out = commands.getstatusoutput(cmd) print out if ec != 0: print "tar failed, curl did not return valid archive (attempt %d/%d)" % (trial, max_trials) else: cmd = "chmod +x *.py *.sh" print cmd ec, out = commands.getstatusoutput(cmd) print ec, out verified = True break else: verified = True break else: print "chmod failed (attempt %d/%d)" % (trial, max_trials) if verified: break return verified
def ReportToMonitor(tpid, host): """Report to the monitor that this pilot ID is live """ if tpid != "?": print "Report pilot ID %s is live" % tpid si = SiteInformation() sslCertificate = si.getSSLCertificate() curl_url = 'http://panda.cern.ch:25980/server/pandamon/query?tpmes=setpilotlive&tpid=%s&host=%s' % (tpid,host) curl_cmd = 'curl --connect-timeout 20' \ + ' --max-time 120' \ + ' --cacert %s' % (sslCertificate) \ + ' -sS "%s"' % curl_url print curl_cmd curl_output = commands.getoutput(curl_cmd) print curl_output else: print "pilot ID is unknown"
def getWorkdir(): """obtaining the working directory """ si = SiteInformation() getworkdir = si.readpar('wntmpdir') ## If workdir contains env var, translate it pat = re.compile('(.*)(\$[^\/]+)(.*)') mat = pat.match(getworkdir) if mat: envvar = mat.group(2) envvar = envvar.replace('$','') if os.environ.has_key(envvar): envval = os.environ[envvar] getworkdir = "%s%s%s" % ( mat.group(1), envval, mat.group(3) ) print "Translated wntmpdir with env var to", getworkdir else: print "WARNING: wntmpdir contains env var %s that is undefined" % envvar return getworkdir
def getrunpars(queuedata, workdir, puser, countrygroup, allowothercountry, libcode): jobstat = 0 appdir = "" reldir = "" sitesetup = "" releases = "" datadir = "" dq2url = "" queue = os.environ['QueueName'] print "atlasProdPilot running on queue", queue if os.environ.has_key('PANDA_URL_SSL'): global baseURLSSL baseURLSSL = os.environ['PANDA_URL_SSL'] pat = re.compile('^(.*):([0-9]+)/.*$') mat = pat.match(baseURLSSL) if mat: pandaURL = mat.group(1) pandaPort = mat.group(2) else: print "!!FAILED!!6999!!Bad Panda URL %s" % baseURLSSL jobstat = 1 else: print "!!FAILED!!6999!!PANDA_URL_SSL undefined" jobstat = 1 # reading setup from queuedata try: # checking if queuedata is not None queuedata except: queuedata = "" else: si = SiteInformation() appdir = si.getpar('appdir',queuedata) datadir = si.getpar('datadir',queuedata) dq2url = si.getpar('dq2url',queuedata) par = si.getpar('gatekeeper',queuedata) if par != "": print "Setting ATLAS_CONDDB to", par os.environ["ATLAS_CONDDB"] = par print "Setting RUCIO_ACCOUNT to pilot" os.environ["RUCIO_ACCOUNT"] = 'pilot' if datadir == "": if os.environ.has_key("SCRATCH_DIRECTORY"): datadir = os.environ["SCRATCH_DIRECTORY"] elif os.environ.has_key("OSG_WN_TMP"): datadir = os.environ["OSG_WN_TMP"] else: print "!!WARNING!!2500!!Cannot locate scratch area" if datadir == "": print "datadir not defined" else: print "datadir:", datadir if dq2url == "": print "dq2url not defined" else: print "dq2url:", dq2url try: release = os.environ["swRelease"] release = release.replace('Atlas-','') print "Release:", release except: print "swRelease is not defined" release = "" runpars = [ '-s', site, '-h', queue, '-d', workdir, '-q', dq2url, '-f', 'true', '-w', pandaURL, '-p', pandaPort, '-l', 'true' ] print "Using source: %s" % (libcode) if "pilotcode-dev.tar.gz" in libcode: runpars.append('-C') runpars.append('0') print "Will turn off multi-jobs for dev pilot" if appdir != "": # Set environment variables the pilot wants os.environ["SITEROOT"]="%s/%s" % 
( appdir, release ) runpars.append('-a') runpars.append(appdir) print "appdir:", appdir else: print "appdir not defined" if puser != "": runpars.append('-u') runpars.append(puser) if countrygroup != "": runpars.append('-o') runpars.append(countrygroup) if allowothercountry != "": runpars.append('-A') runpars.append(allowothercountry) return runpars, jobstat
_dir = '/etc/grid-security/certificates' if os.path.exists(_dir): sslCertificatesDirectory = _dir else: tolog("!!WARNING!!2999!! $X509_CERT_DIR is not set and default location %s does not exist" % (_dir)) return sslCertificatesDirectory def getProperPaths(self): """ Return proper paths for the storage element """ # Implement in sub-class return "" def getTier1Queue(self, cloud): """ Download the queuedata for the Tier-1 in the corresponding cloud and get the queue name """ # Implement in sub-class # This method is used during stage-out to alternative [Tier-1] site when primary stage-out on a Tier-2 fails # See methods in ATLASSiteInformation return None if __name__ == "__main__": from SiteInformation import SiteInformation import os os.environ['PilotHomeDir'] = os.getcwd() s1 = SiteInformation() print "copytool=",s1.readpar('copytool')
class curlSiteMover(SiteMover.SiteMover): """ SiteMover for curl """ copyCommand = "curl" checksum_command = "adler32" has_mkdir = False has_df = False has_getsize = False has_md5sum = True has_chmod = False timeout = 3600 """ get proxy """ si = SiteInformation() sslCert = si.getSSLCertificate() sslKey = sslCert sslCertDir = si.getSSLCertificatesDirectory() def __init__(self, setup_path, *args, **kwrds): self._setup = setup_path def get_timeout(self): return self.timeout def check_space(self, ub): """ For when space availability is not verifiable """ return 999999 def core_get_data(self, envsetup, token, source_surl, dest_path, experiment): """ stage-in core function, can be overridden (see stormSiteMover) """ error = PilotErrors() # determine which timeout option to use timeout_option = "--connect-timeout 300 --max-time %d" % (self.timeout) sslCert = self.sslCert sslKey = self.sslKey sslCertDir = self.sslCertDir # used curl options: # --cert: <cert[:passwd]> Client certificate file and password (SSL) # --capath: <directory> CA directory (made using c_rehash) to verify # --location: Follow Location: hints (H) # --output: <file> Write output to <file> instead of stdout # --cilent: Makes Curl mute # --show-error: When used with -s it makes curl show error message if it fails # Removed for SL6: --ciphers <list of ciphers> (SSL) Specifies which ciphers to use in the connection. """ define curl command string """ _cmd_str = 'lcg-gt %s https' % (source_surl) try: s, o = commands.getstatusoutput(_cmd_str) tolog("Executing command: %s" % (_cmd_str)) except Exception, e: tolog("!!WARNING!!2990!! 
Exception caught: %s (%d, %s)" % (str(e), s, o)) o = str(e) if s == 0: tolog("lcg-gt supported, get http path") source_surl = o.strip().split() source_surl = source_surl[0] _cmd_str = '%s curl --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % ( envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey, source_surl, dest_path) # _cmd_str = '%s curl --ciphers ALL:NULL --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey, source_surl, dest_path) else: tolog( "lcg-gt not supported, get http path by replacing source_surl") _cmd_str = '%s curl --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % ( envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey, source_surl, dest_path) # _cmd_str = '%s curl --ciphers ALL:NULL --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey, source_surl, dest_path) _cmd_str = _cmd_str.replace("srm://", "https://") # add the full stage-out command to the job setup script #_cmd_str = _cmd_str.replace("file://", "-o ") # get the experiment object thisExperiment = getExperiment(experiment) to_script = _cmd_str to_script = to_script.lstrip(' ') # remove any initial spaces if to_script.startswith('/'): to_script = 'source ' + to_script thisExperiment.updateJobSetupScript(os.path.dirname(dest_path), to_script=to_script) tolog("Executing command: %s" % (_cmd_str)) s = -1 o = '(not defined)' t0 = os.times() try: s, o = commands.getstatusoutput(_cmd_str) except Exception, e: tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" % (str(e), s, o)) o = str(e)
surl = surl.replace("s3+rucio", "s3") if surl.startswith("root:"): sitemover = xrootdObjectstoreSiteMover(self.getSetup()) return sitemover. put_data(source, destination, fsize, fchecksum, **pdict) if surl.startswith("s3:"): sitemover = S3ObjectstoreSiteMover(self.getSetup(), self._useTimerCommand) return sitemover. put_data(source, surl, fsize, fchecksum, **pdict) return -1, "No objectstore sitemover found for this scheme(%s)" % destination, destination, fsize, fchecksum, config_sm.ARCH_DEFAULT if __name__ == '__main__': os.environ['PilotHomeDir'] = os.getcwd() from SiteInformation import SiteInformation s1 = SiteInformation() #s1.getObjectstoresField("os_access_key", "eventservice", queuename='BNL_EC2W2_MCORE') f = objectstoreSiteMover() gpfn = "nonsens_gpfn" lfn = "AOD.310713._000004.pool.root.1" path = os.getcwd() fsize = "4261010441" fchecksum = "9145af38" dsname = "data11_7TeV.00177986.physics_Egamma.merge.AOD.r2276_p516_p523_tid310713_00" report = {} #print f.getGlobalFilePaths(dsname) #print f.findGlobalFilePath(lfn, dsname) #print f.getLocalROOTSetup()
def executePayload(self, thisExperiment, job):
    """Send the ARGO job to the message queue and poll for its status.

    A MessageInterface is configured (host, port, SSL certificate, key and
    CA certificates), a status queue named '<jobId>_job_status' is created,
    the serialized ARGO job is sent, and status messages are then polled
    until a HISTORY or FAILED state is received. Any exception is logged
    and reported through self.failJob().

    :param thisExperiment: not used in the visible part of this method.
    :param job: job object; job.jobId and job.inFiles are read and
                job.hpcStatus is updated from each status message.
    """

    t0 = os.times()
    res_tuple = None

    # loop over all run commands (only >1 for multi-trfs)
    getstatusoutput_was_interrupted = False
    job_status = None
    tolog("About to launch ARGO job")
    # Poll MQ for Job Status
    try:
        # Initiate MQ interface and send job
        self.argo_job.job_status_routing_key = '%s_job_status' % job.jobId #'status_' + jobID
        si = SiteInformation()
        mi = MessageInterface()
        mi.host = 'atlasgridftp02.hep.anl.gov'
        mi.port = 5671
        # NOTE(review): this value is overwritten two statements further down
        mi.ssl_cert = si.getSSLCertificate() #'/grid/atlas/hpc/pilot_certs/xrootdsrv-cert.pem'
        proxy_cert_path = si.getSSLCertificate()
        # default certificate: rabbitmq-cert.pem in the directory of the proxy certificate
        mi.ssl_cert = os.path.dirname(proxy_cert_path) + "/rabbitmq-cert.pem"
        # $X509_USER_CERT overrides the default when set
        if 'X509_USER_CERT' in os.environ.keys():
            mi.ssl_cert = os.environ['X509_USER_CERT'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-cert.pem'

        # NOTE(review): this value is overwritten by the next statement
        mi.ssl_key = mi.ssl_cert #'/grid/atlas/hpc/pilot_certs/xrootdsrv-key.pem'
        mi.ssl_key = os.path.dirname(proxy_cert_path) + "/rabbitmq-key.pem"
        # $X509_USER_KEY overrides the default when set
        if 'X509_USER_KEY' in os.environ.keys():
            mi.ssl_key = os.environ['X509_USER_KEY'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-key.pem'

        #mi.ssl_ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
        mi.ssl_ca_certs = '/grid/atlas/hpc/pilot_certs/cacerts.pem'
        #if 'X509_CA_CERTS' in os.environ.keys():
        #    mi.ssl_ca_certs = os.environ['X509_CA_CERTS'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/cacerts.pem'
        #tolog("CA certs: %s" % (mi.ssl_ca_certs))
        # prefer rabbitmq-cacerts.pem next to the proxy certificate when that file exists
        ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
        if os.path.isfile(ca_certs):
            mi.ssl_ca_certs = ca_certs

        mi.exchange_name = 'argo_users'

        #Create queue to get messages about ARGO Job status from MQ
        tolog('Opening connection with MQ')
        mi.open_blocking_connection()
        tolog('Create queue [%s]  to retrieve messages with job status' % self.argo_job.job_status_routing_key)

        mi.create_queue(self.argo_job.job_status_routing_key, self.argo_job.job_status_routing_key)

        # submit ARGO job to MQ

        #tolog('Opening connection with MQ')
        #mi.open_blocking_connection()
        # the development routing key is used when self.dev is set
        routing_key = 'argo_job'
        if self.dev:
            routing_key = 'argo_job_dev'
        tolog('Sending msg with job to ARGO')
        mi.send_msg(self.argo_job.serialize(), routing_key)
        tolog(' done sending ')

        # Waiting till job done or failed
        ARGO_err_msg = ''
        while True:
            time.sleep(5)
            message = mi.receive_msg(self.argo_job.job_status_routing_key, True)
            # message[0], message[1], message[2] are logged below as method, properties and body
            if message[2]:
                tolog ("Got message from queue [%s]: method [%s], properties [%s], body [ %s ]" % (self.argo_job.job_status_routing_key, message[0], message[1], message[2]))
                job_status = ArgoJobStatus.get_from_message(message[2])
                job.hpcStatus = job_status.state
                rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort())

                tolog("Extracted state: %s" % job_status.state)
                if job_status.state == job_status.HISTORY:
                    res_tuple = (0, "Done")
                    break
                elif job_status.is_failed():
                    # NOTE(review): no break here, so polling continues after this state;
                    # the FAILED branch below is only reached when is_failed() is false - confirm intent
                    res_tuple = (1, "Failed")
                    ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                elif job_status.state == job_status.FAILED:
                    res_tuple = (1, "Failed")
                    ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                    # NOTE(review): 'runJob' is not defined in this method (self.failJob is used in the handler below) - verify
                    runJob.failJob(1, 0, job, ins=job.inFiles, pilotErrorDiag=ARGO_err_msg)
                    break
            # second pause; together with the sleep at the top of the loop
            time.sleep(5)

        mi.close()
        tolog(' closing connection to MQ')

        tolog("Job State: %s" % (job_status.state))
        #job.timeExe = int(fork_job.finished - fork_job.started)

        ####################################################

    except Exception, e:
        tolog("!!FAILED!!3000!! Failed to run command %s" % str(e))
        getstatusoutput_was_interrupted = True
        res_tuple = (1, "Failed")
        self.failJob(0, self.__error.ERR_GENERALERROR, job, pilotErrorDiag=str(e))
def executePayload(self, thisExperiment, job):
    """Submit the ARGO job through the message queue and wait for its outcome.

    Configures a MessageInterface (host, port, SSL certificate, key and CA
    certificates), creates the '<jobId>_job_status' queue, sends the
    serialized ARGO job and polls the queue until a HISTORY or FAILED state
    arrives. Exceptions are logged and reported through self.failJob().

    :param thisExperiment: not used in the visible part of this method.
    :param job: job object; job.jobId and job.inFiles are read and
                job.hpcStatus is set from each status message.
    """

    t0 = os.times()
    res_tuple = None

    # loop over all run commands (only >1 for multi-trfs)
    getstatusoutput_was_interrupted = False
    job_status = None
    tolog("About to launch ARGO job")
    # Poll MQ for Job Status
    try:
        # Initiate MQ interface and send job
        self.argo_job.job_status_routing_key = '%s_job_status' % job.jobId  #'status_' + jobID
        si = SiteInformation()
        mi = MessageInterface()
        mi.host = 'atlasgridftp02.hep.anl.gov'
        mi.port = 5671
        # NOTE(review): assigned here and replaced again two statements below
        mi.ssl_cert = si.getSSLCertificate(
        )  #'/grid/atlas/hpc/pilot_certs/xrootdsrv-cert.pem'
        proxy_cert_path = si.getSSLCertificate()
        # default: rabbitmq-cert.pem located beside the proxy certificate
        mi.ssl_cert = os.path.dirname(
            proxy_cert_path) + "/rabbitmq-cert.pem"
        # an exported X509_USER_CERT takes precedence
        if 'X509_USER_CERT' in os.environ.keys():
            mi.ssl_cert = os.environ[
                'X509_USER_CERT']  #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-cert.pem'

        # NOTE(review): replaced by the following statement
        mi.ssl_key = mi.ssl_cert  #'/grid/atlas/hpc/pilot_certs/xrootdsrv-key.pem'
        mi.ssl_key = os.path.dirname(proxy_cert_path) + "/rabbitmq-key.pem"
        # an exported X509_USER_KEY takes precedence
        if 'X509_USER_KEY' in os.environ.keys():
            mi.ssl_key = os.environ[
                'X509_USER_KEY']  #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-key.pem'

        #mi.ssl_ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
        mi.ssl_ca_certs = '/grid/atlas/hpc/pilot_certs/cacerts.pem'
        #if 'X509_CA_CERTS' in os.environ.keys():
        #    mi.ssl_ca_certs = os.environ['X509_CA_CERTS'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/cacerts.pem'
        #tolog("CA certs: %s" % (mi.ssl_ca_certs))
        # use rabbitmq-cacerts.pem beside the proxy certificate if that file is present
        ca_certs = os.path.dirname(
            proxy_cert_path) + "/rabbitmq-cacerts.pem"
        if os.path.isfile(ca_certs):
            mi.ssl_ca_certs = ca_certs

        mi.exchange_name = 'argo_users'

        #Create queue to get messages about ARGO Job status from MQ
        tolog('Opening connection with MQ')
        mi.open_blocking_connection()
        tolog('Create queue [%s]  to retrieve messages with job status' %
              self.argo_job.job_status_routing_key)

        mi.create_queue(self.argo_job.job_status_routing_key,
                        self.argo_job.job_status_routing_key)

        # submit ARGO job to MQ

        #tolog('Opening connection with MQ')
        #mi.open_blocking_connection()
        # self.dev selects the development routing key
        routing_key = 'argo_job'
        if self.dev:
            routing_key = 'argo_job_dev'
        tolog('Sending msg with job to ARGO')
        mi.send_msg(self.argo_job.serialize(), routing_key)
        tolog(' done sending ')

        # Waiting till job done or failed
        ARGO_err_msg = ''
        while True:
            time.sleep(5)
            message = mi.receive_msg(self.argo_job.job_status_routing_key,
                                     True)
            # the three message elements are logged below as method, properties and body
            if message[2]:
                tolog(
                    "Got message from queue [%s]: method [%s], properties [%s], body [ %s ]"
                    % (self.argo_job.job_status_routing_key, message[0],
                       message[1], message[2]))
                job_status = ArgoJobStatus.get_from_message(message[2])
                job.hpcStatus = job_status.state
                rt = RunJobUtilities.updatePilotServer(
                    job, self.getPilotServer(), self.getPilotPort())

                tolog("Extracted state: %s" % job_status.state)
                if job_status.state == job_status.HISTORY:
                    res_tuple = (0, "Done")
                    break
                elif job_status.is_failed():
                    # NOTE(review): this branch does not leave the loop; the FAILED branch
                    # below only runs when is_failed() returns false - confirm intent
                    res_tuple = (1, "Failed")
                    ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                elif job_status.state == job_status.FAILED:
                    res_tuple = (1, "Failed")
                    ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                    # NOTE(review): 'runJob' is not defined in this method (the handler below uses self.failJob) - verify
                    runJob.failJob(1,
                                   0,
                                   job,
                                   ins=job.inFiles,
                                   pilotErrorDiag=ARGO_err_msg)
                    break
            # additional pause on top of the sleep at the start of each iteration
            time.sleep(5)

        mi.close()
        tolog(' closing connection to MQ')

        tolog("Job State: %s" % (job_status.state))
        #job.timeExe = int(fork_job.finished - fork_job.started)

        ####################################################

    except Exception, e:
        tolog("!!FAILED!!3000!! Failed to run command %s" % str(e))
        getstatusoutput_was_interrupted = True
        res_tuple = (1, "Failed")
        self.failJob(0,
                     self.__error.ERR_GENERALERROR,
                     job,
                     pilotErrorDiag=str(e))