def getTier1Queue2(self, cloud):
    """ Download the queuedata for the Tier-1 in the corresponding cloud and get the queue name

    :param cloud: cloud name to look up (string)
    :return: PanDA queue name of the cloud's Tier-1 (string), or "" if the info
             could not be downloaded or the cloud/queue was not found
    """

    queuename = ""

    path = self.getTier1InfoFilename()
    ec = self.downloadTier1Info()
    if ec == 0:
        # process the downloaded T-1 info
        # "with" guarantees the file is closed even if json/pickle parsing raises
        # (the previous open()/close() sequence leaked the handle on error)
        with open(path, 'r') as f:
            if getExtension() == "json":
                from json import loads
                data = loads(f.read())
            else:
                from pickle import load
                data = load(f)

        # extract the relevant queue info for the given cloud
        T1_info = [x for x in data if x['cloud'] == cloud]

        # finally get the queue name
        if T1_info != []:
            info = T1_info[0]
            # dict.has_key() is deprecated (and removed in Python 3); use "in"
            if 'PanDAQueue' in info:
                queuename = info['PanDAQueue']
            else:
                tolog("!!WARNING!!1222!! Returned Tier-1 info object does not have key PanDAQueue: %s" % str(info))
        else:
            tolog("!!WARNING!!1223!! Found no Tier-1 info for cloud %s" % (cloud))

    return queuename
def rename(self, site, job):
    """ Rename the job state file. Should only be called for holding jobs that have passed the maximum number of recovery attempts.

    :param site: object carrying the work directory (site.workdir)
    :param job: object carrying the job id (job.jobId)
    :return: True if the file was renamed (or marked MAXEDOUT), False otherwise
    """

    status = True

    # get the file extension
    extension = getExtension()

    # jobState-<id>.<ext> -> jobState-<id>.<ext>.MAXEDOUT
    fileNameOld = "%s/jobState-%s.%s" % (site.workdir, job.jobId, extension)
    fileNameNew = "%s/jobState-%s.%s.MAXEDOUT" % (site.workdir, job.jobId, extension)

    if os.path.isfile(fileNameOld):
        # rename the job state file
        # os.rename() raises OSError on failure; the previous
        # os.system("mv ...") never raised OSError, so its except branch was
        # unreachable and a failed mv was silently reported as success
        try:
            os.rename(fileNameOld, fileNameNew)
        except OSError:
            tolog("JOBSTATE FAILURE: Failed to rename job state file: %s" % (fileNameOld))
            status = False
        else:
            tolog("Job state file renamed to: %s" % (fileNameNew))
    else:
        tolog("JOBSTATE FAILURE: Job state file does not exist: %s" % (fileNameOld))
        status = False

    return status
def getTier1InfoFilename(self):
    """ Return the full path of the Tier-1 info file: $PilotHomeDir/Tier-1_info.<extension> """

    return "%s/Tier-1_info.%s" % (os.environ['PilotHomeDir'], getExtension())
def ReadQueueParams():
    """ Read the queuedata from an existing local file, or download and verify it """
    # NOTE(review): this function appears truncated in this view -- it ends on a
    # dangling "else:" after the curl call; the remainder presumably verifies the
    # downloaded queuedata. Confirm against the full file.

    queuedata = ""
    verified = False

    si = SiteInformation()
    # resolve the local queuedata file name without checking for its existence
    qdfname = si.getQueuedataFileName(check=False)
    # qdfname = "%s/queuedata.dat" % (os.environ['PilotHomeDir'])
    # qdfname = "%s/queuedata.json" % (os.environ['PilotHomeDir'])

    if os.path.exists(qdfname):
        print "queuedata already downloaded"
        verified, queuedata = readqueuedata(qdfname, first_trial=True)
    else:
        print "Downloading queuedata in atlasProdPilot"
        # download queuedata and verify it
        extension = utils.getExtension(alternative='dat')
        # the server serves <queue>.pilot.json or <queue>.pilot.pilot
        if extension == "json":
            _ext = extension
        else:
            _ext = "pilot"
        # NOTE(review): "qname" is not defined anywhere in this view -- presumably
        # a module-level queue name; verify against the full file
        curl_url = 'http://pandaserver.cern.ch:25085/cache/schedconfig/%s.pilot.%s' % (qname, _ext)
        si = SiteInformation()
        sslCertificate = si.getSSLCertificate()
        cmd = 'curl --connect-timeout 20 --max-time 120 --cacert %s -sS "%s" > %s' % (sslCertificate, curl_url, qdfname)
        print "Executing command: %s" % (cmd)
        try:
            # output is piped into qdfname, so rs is expected to be empty
            ret, rs = commands.getstatusoutput(cmd)
        except Exception, e:
            print "!!WARNING!!1999!! Failed with curl command: %s" % str(e)
        else:
            # (body truncated in this view)
def getQueuedataFileName(self, useExtension=None, check=True, alt=False):
    """ Build the queuedata file path, falling back between .json and .dat

    :param useExtension: forced file extension (otherwise getExtension(alternative='dat') decides)
    :param check: when True, swap a missing .json/.dat path for its existing counterpart
    :param alt: when True, prepend "alt." for alternative stage-out site queuedata
    :return: full path to the queuedata file (string)
    """

    # use a forced extension if necessary
    extension = useExtension if useExtension else getExtension(alternative='dat')

    # prepend alt. for alternative stage-out site queuedata
    if alt:
        extension = "alt." + extension

    path = "%s/queuedata.%s" % (os.environ['PilotHomeDir'], extension)

    # the wrapper may have stored the file under the other extension; if the
    # expected file is missing, try its counterpart (complication due to wrapper)
    if check and not os.path.exists(path):
        for current, other in (('json', 'dat'), ('dat', 'json')):
            if extension == current:
                _path = path.replace('.%s' % current, '.%s' % other)
                if os.path.exists(_path):
                    tolog("Updating queuedata file name to: %s" % (_path))
                    path = _path
                else:
                    tolog("!!WARNING!! Queuedata paths do not exist: %s, %s" % (path, _path))

    return path
def getTrfExitInfo(exitCode, workdir):
    """ Get the trf exit code and info from job report if possible

    :param exitCode: original trf exit code (int), kept unless the report overrides it
    :param workdir: directory expected to contain the job report
    """
    # NOTE(review): this function appears truncated in this view -- the collected
    # exitCode/exitAcronym/exitMsg are logged but never returned here; confirm
    # the return statement against the full file

    exitAcronym = ""
    exitMsg = ""

    # does the job report exist?
    extension = getExtension(alternative='pickle')
    # json reports are named jobReport.*, pickle extracts jobReportExtract.*
    if extension.lower() == "json":
        filename = os.path.join(workdir, "jobReport.%s" % (extension))
    else:
        filename = os.path.join(workdir, "jobReportExtract.%s" % (extension))
    if os.path.exists(filename):
        tolog("Found job report: %s" % (filename))

        # search for the exit code
        try:
            f = open(filename, "r")
        except Exception, e:
            tolog("!!WARNING!!1112!! Failed to open job report: %s" % (e))
        else:
            if extension.lower() == "json":
                from json import load
            else:
                from pickle import load
            # NOTE(review): if load() raises here, f is never closed
            data = load(f)

            # extract the exit code and info
            _exitCode = extractDictionaryObject("exitCode", data)
            if _exitCode:
                # a zero report code cannot override a non-zero trf code
                if _exitCode == 0 and exitCode != 0:
                    tolog("!!WARNING!!1111!! Detected inconsistency in %s: exitcode listed as 0 but original trf exit code was %d (using original error code)" %\
                        (filename, exitCode))
                else:
                    exitCode = _exitCode
            _exitAcronym = extractDictionaryObject("exitAcronym", data)
            if _exitAcronym:
                exitAcronym = _exitAcronym
            _exitMsg = extractDictionaryObject("exitMsg", data)
            if _exitMsg:
                exitMsg = _exitMsg

            f.close()

    tolog("Trf exited with:")
    tolog("...exitCode=%d" % (exitCode))
    tolog("...exitAcronym=%s" % (exitAcronym))
    tolog("...exitMsg=%s" % (exitMsg))
def getQueuedata(self, queuename, forceDownload=False, alt=False):
    """ Download the queuedata if not already downloaded

    :param queuename: PanDA queue name used to build the download URL
    :param forceDownload: when True, re-download even if a local file exists
    :param alt: when True, use the alternative stage-out site queuedata file
    """
    # NOTE(review): this function appears truncated in this view -- there is no
    # final return after the download loop; confirm against the full file

    # read the queue parameters and create the queuedata.dat file (if the queuename variable is set)
    # pilot home dir is not the workdir. It's the sandbox directory for the pilot.

    # queuedata structure: (with example values)
    # appdir=/ifs0/osg/app/atlas_app/atlas_rel
    # dq2url=http://gk03.swt2.uta.edu:8000/dq2/
    # copytool=gridftp
    # copysetup=
    # ddm=UTA_SWT2
    # se=http://gk03.swt2.uta.edu:8000/dq2/
    # sepath=
    # envsetup=
    # region=US
    # copyprefix=
    # lfcpath=
    # lfchost=
    # sein=
    # wntmpdir=/scratch

    # example download command:
    # curl --connect-timeout 20 --max-time 120 -sS "http://pandaserver.cern.ch:25085/cache/schedconfig/BNL_CVMFS_1-condor.pilot.pilot"

    # fall back to the current directory if the wrapper did not set PilotHomeDir
    if not os.environ.has_key('PilotHomeDir'):
        os.environ['PilotHomeDir'] = commands.getoutput('pwd')

    hasQueuedata = False

    # try the config servers one by one in case one of them is not responding

    # in case the wrapper has already downloaded the queuedata, it might have a .dat extension
    # otherwise, give it a .json extension if possible
    filename_dat = self.getQueuedataFileName(useExtension='dat', check=False, alt=alt)
    if os.path.exists(filename_dat):
        filename = filename_dat
    else:
        filename = self.getQueuedataFileName(check=False, alt=alt)

    if os.path.exists(filename) and not forceDownload:
        tolog("Queuedata has already been downloaded by pilot wrapper script (will confirm validity)")
        hasQueuedata = self.verifyQueuedata(queuename, filename, 1, 1, "(see batch log for url)")
        if hasQueuedata:
            tolog("Queuedata was successfully downloaded by pilot wrapper script")
        else:
            tolog("Queuedata was not downloaded successfully by pilot wrapper script, will try again")

    if not hasQueuedata:
        # loop over pandaserver round robin _N times until queuedata has been verified, or fail
        ret = -1
        url = 'http://pandaserver.cern.ch'
        # use the user proxy if available, otherwise the default /tmp proxy location
        if os.environ.has_key('X509_USER_PROXY'):
            sslCert = os.environ['X509_USER_PROXY']
        else:
            sslCert = '/tmp/x509up_u%s' % str(os.getuid())
        cmd = 'curl --connect-timeout 20 --max-time 120 --cacert %s -sS "%s:25085/cache/schedconfig/%s.pilot.%s" > %s' % \
            (sslCert, url, queuename, getExtension(alternative='pilot'), filename)

        _N = 3
        for _i in range(_N):
            tolog("Executing command: %s" % (cmd))
            try:
                # output will be empty since we pipe into a file
                ret, output = commands.getstatusoutput(cmd)
            except Exception, e:
                tolog("!!WARNING!!1999!! Failed with curl command: %s" % str(e))
                return -1, False
            else:
                if ret == 0:
                    # read back the queuedata to verify its validity
                    hasQueuedata = self.verifyQueuedata(queuename, filename, _i, _N, url)
                    if hasQueuedata:
                        break
                else:
                    tolog("!!WARNING!!1999!! curl command exited with code %d" % (ret))