def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """ copy input file from SE to local dir """

    error = PilotErrors()

    # Get input parameters from pdict
    jobId = pdict.get('jobId', '')
    workDir = pdict.get('workDir', '')
    experiment = pdict.get('experiment', '')
    pandaProxySecretKey = pdict.get('pandaProxySecretKey')
    jobSetID = pdict.get('jobsetID')

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 's3objectstorepresignedurl', lfn, guid)

    if path == '':
        path = './'
    fullname = os.path.join(path, lfn)

    # pandaID, filename, jobSetID, pandaProxySecretKey=None, stageIn=True
    status, output = self.stageIn(jobId, lfn, jobSetID, pandaProxySecretKey, fullname, fsize, fchecksum, experiment)

    if status == 0:
        updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", type="input")
        state = "DONE"
    else:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state is None:
            state = "PSTAGE_FAIL"

    # self.__sendReport(state, report)
    self.prepareReport(state, report)
    return status, output
def TransferFiles(job_state, datadir, files, **kwargs):
    """
    Transfers files from list 'files'
    May change CWD with pUtil.chdir (several times)
    :param job_state:
    :param datadir: job data dir
    :param files: list of filenames
    :param kwargs: specific arguments for other purposes
    :return:
    """
    job = job_state.job

    pUtil.chdir(datadir)

    XMLMetadata = pUtil.getMetadata(job_state.site.workdir, job.jobId)
    thisSite = DorE(kwargs, 'thisSite')

    if not setGuids(job_state, files, **kwargs):
        job.result[2] = PilotErrors().ERR_LOSTJOBPFC
        return ReturnCode.FailedJob

    outPFC = updateOutPFC(job, **kwargs)
    if not outPFC:
        return ReturnCode.FailedJob

    dsname = defaultDSname(job.destinationDblock)

    datasetDict = pUtil.getDatasetDict(job.outFiles, job.destinationDblock, job.logFile, job.logDblock)
    if not datasetDict:
        log("Output files will go to default dataset: %s" % (dsname))

    # the cmtconfig is needed by at least the xrdcp site mover
    cmtconfig = pUtil.getCmtconfig(job.cmtconfig)

    tin_0 = os.times()

    rf = None
    _state = ReturnCode.OK
    _msg = ""
    ec = -1
    try:
        # Note: alt stage-out numbers are not saved in recovery mode (job object not returned from this function)
        rc, pilotErrorDiag, rf, rs, job.filesNormalStageOut, job.filesAltStageOut, os_bucket_id = Mover.mover_put_data(
            "xmlcatalog_file:%s" % outPFC,
            dsname,
            thisSite.sitename,
            thisSite.computingElement,
            analysisJob=pUtil.isAnalysisJob(job.trf.split(",")[0]),
            proxycheck=DorE(kwargs, 'proxycheckFlag'),
            pinitdir=DorE(kwargs, 'pilot_initdir'),
            datasetDict=datasetDict,
            stageoutTries=DorE(kwargs, 'stageoutretry'),
            cmtconfig=cmtconfig,
            recoveryWorkDir=thisSite.workdir,
            job=job)
    except Exception, e:
        pilotErrorDiag = "Put function can not be called for staging out: %s" % str(e)
        log("!!%s!!1105!! %s" % (env['errorLabel'], pilotErrorDiag))
        ec = PilotErrors().ERR_PUTFUNCNOCALL
        _state = ReturnCode.Holding
        _msg = env['errorLabel']
def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
    """ copy output file from disk to local SE """
    # function is based on dCacheSiteMover put function

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    token = pdict.get('token', '')
    scope = pdict.get('scope', '')
    dsname = pdict.get('dsname', '')
    experiment = pdict.get('experiment', '')
    outputDir = pdict.get('outputDir', '')
    os_bucket_id = pdict.get('os_bucket_id', -1)
    timeout = pdict.get('timeout', None)
    if not timeout:
        timeout = self.timeout

    # get the site information object
    si = getSiteInformation(experiment)

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 's3objectstore', lfn, guid)

    parsed = urlparse.urlparse(destination)
    scheme = parsed.scheme
    hostname = parsed.netloc.partition(':')[0]
    port = int(parsed.netloc.partition(':')[2])
    report['remoteSite'] = '%s://%s:%s' % (scheme, hostname, port)

    filename = os.path.basename(source)
    surl = destination
    self.log("surl=%s, timeout=%s" % (surl, timeout))
    if "log.tgz" in surl:
        surl = surl.replace(lfn, "%s:%s" % (scope, lfn))
    else:
        report['eventType'] = 'put_es'

    status, output, size, checksum = self.stageOut(source, surl, token, experiment, outputDir=outputDir, timeout=timeout, os_bucket_id=os_bucket_id, report=report)
    if status != 0:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state is None:
            state = "PSTAGE_FAIL"
        # self.__sendReport(state, report)
        self.prepareReport(state, report)
        return self.put_data_retfail(status, output, surl)

    state = "DONE"
    # self.__sendReport(state, report)
    self.prepareReport(state, report)

    return 0, pilotErrorDiag, surl, size, checksum, self.arch_type
def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
    """ copy output file from disk to local SE """
    # function is based on dCacheSiteMover put function

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    alt = pdict.get('alt', False)
    jobId = pdict.get('jobId', '')
    jobSetID = pdict.get('jobsetID', '')
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    #token = pdict.get('token', '')
    scope = pdict.get('scope', '')
    dsname = pdict.get('dsname', '')
    analysisJob = pdict.get('analJob', False)
    testLevel = pdict.get('testLevel', '0')
    extradirs = pdict.get('extradirs', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)
    prodSourceLabel = pdict.get('prodSourceLabel', '')
    outputDir = pdict.get('outputDir', '')
    timeout = pdict.get('timeout', None)
    pandaProxySecretKey = pdict.get('pandaProxySecretKey')
    if not timeout:
        timeout = self.timeout

    # get the site information object
    si = getSiteInformation(experiment)

    tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 's3objectstorepresignedurl', lfn, guid)

    filename = os.path.basename(source)
    surl = destination

    status, output, size, checksum = self.stageOut(source, jobId, lfn, jobSetID, pandaProxySecretKey, experiment, outputDir=outputDir, timeout=timeout)
    if status != 0:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state is None:
            state = "PSTAGE_FAIL"
        # self.__sendReport(state, report)
        self.prepareReport(state, report)
        return self.put_data_retfail(status, output, surl)

    state = "DONE"
    # self.__sendReport(state, report)
    # self.prepareReport(state, report)
    return 0, pilotErrorDiag, surl, size, checksum, self.arch_type
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """ copy input file from SE to local dir """

    error = PilotErrors()

    # Get input parameters from pdict
    jobId = pdict.get('jobId', '')
    workDir = pdict.get('workDir', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)

    # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
    useCT = pdict.get('usect', True)
    prodDBlockToken = pdict.get('access', '')

    # get the DQ2 tracing report
    report = self.getStubTracingReport(pdict['report'], 'gfal-copy', lfn, guid)

    if path == '':
        path = './'
    fullname = os.path.join(path, lfn)

    # get the site information object
    si = getSiteInformation(experiment)

    ret_path = si.getCopyPrefixPathNew(gpfn, stageIn=True)
    if not ret_path.startswith("s3:"):
        errorLog = "Failed to use copyprefix to convert the current path to S3 path."
        tolog("!!WARNING!!1777!! %s" % (errorLog))
        status = error.ERR_STAGEINFAILED
        state = "PSTAGE_FAIL"
        output = errorLog
    else:
        gpfn = ret_path
        status, output = self.stageIn(gpfn, fullname, fsize, fchecksum, experiment)

    if status == 0:
        updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", type="input")
        state = "DONE"
    else:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state is None:
            state = "PSTAGE_FAIL"

    self.prepareReport(state, report)
    return status, output
def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
    """ copy output file from disk to local SE """
    # function is based on dCacheSiteMover put function

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    alt = pdict.get('alt', False)
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    token = pdict.get('token', '')
    scope = pdict.get('scope', '')
    dsname = pdict.get('dsname', '')
    analysisJob = pdict.get('analJob', False)
    testLevel = pdict.get('testLevel', '0')
    extradirs = pdict.get('extradirs', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)
    prodSourceLabel = pdict.get('prodSourceLabel', '')

    # get the site information object
    si = getSiteInformation(experiment)

    tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
    if prodSourceLabel == 'ddm' and analysisJob:
        tolog("Treating PanDA Mover job as a production job during stage-out")
        analysisJob = False

    # get the DQ2 tracing report
    report = self.getStubTracingReport(pdict['report'], 's3objectstore', lfn, guid)

    filename = os.path.basename(source)
    surl = destination

    status, output, size, checksum = self.stageOut(source, surl, token, experiment)
    if status != 0:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state is None:
            state = "PSTAGE_FAIL"
        self.prepareReport(state, report)
        return self.put_data_retfail(status, output, surl)

    state = "DONE"
    self.prepareReport(state, report)
    return 0, pilotErrorDiag, surl, size, checksum, self.arch_type
def pollChildren(self):
    """ check children processes, collect zombie jobs, and update jobDic status """

    error = PilotErrors()
    # tolog("---pollChildren: %s" % str(jobDic))

    for k in self.__env['jobDic'].keys():
        try:
            _id, rc = os.waitpid(self.__env['jobDic'][k][0], os.WNOHANG)
        except OSError, e:
            try:
                if self.__env['jobDic'][k][1].result[0] in ["finished", "failed", "holding"]:
                    continue
            except:
                pUtil.tolog("!!FAILED!!1000!! Pilot failed to check the job state: %s" % traceback.format_exc())

            pUtil.tolog("Harmless exception when checking job %s, %s" % (self.__env['jobDic'][k][1].jobId, e))
            if str(e).rstrip() == "[Errno 10] No child processes":
                pilotErrorDiag = "Exception caught by pilot watchdog: %s" % str(e)
                pUtil.tolog("!!FAILED!!1000!! Pilot setting state to failed since there are no child processes")
                pUtil.tolog("!!FAILED!!1000!! %s" % (pilotErrorDiag))
                self.__env['jobDic'][k][1].result[0] = "failed"
                self.__env['jobDic'][k][1].currentState = self.__env['jobDic'][k][1].result[0]
                if self.__env['jobDic'][k][1].result[2] == 0:
                    self.__env['jobDic'][k][1].result[2] = error.ERR_NOCHILDPROCESSES
                if self.__env['jobDic'][k][1].pilotErrorDiag == "":
                    self.__env['jobDic'][k][1].pilotErrorDiag = pilotErrorDiag
            else:
                pass
        else:
            if _id:  # finished
                rc = rc % 255  # exit code
                if k == "prod":  # production job is done
                    self.__prodJobDone = True
                    pUtil.tolog("Production job is done")
                if self.__env['jobDic'][k][1].result[0] not in ["finished", "failed", "holding"]:
                    if not rc:  # rc=0, ok job
                        if not self.__env['jobDic'][k][1].result[1]:
                            self.__env['jobDic'][k][1].result[1] = rc  # transExitCode (because pilotExitCode is reported back by child job)
                    else:  # rc != 0, failed job
                        self.__env['jobDic'][k][1].result[1] = rc  # transExitCode
def verifyMultiTrf(jobParameterList, jobHomePackageList, jobTrfList, jobAtlasRelease):
    """ make sure that a multi-trf (or single trf) job is properly setup """

    error = PilotErrors()
    ec = 0
    pilotErrorDiag = ""

    N_jobParameterList = len(jobParameterList)
    N_jobHomePackageList = len(jobHomePackageList)
    N_jobTrfList = len(jobTrfList)
    N_jobAtlasRelease = len(jobAtlasRelease)

    # test jobs have multiple atlas releases defined, but not real tasks
    if N_jobTrfList > N_jobAtlasRelease and N_jobAtlasRelease == 1:
        # jobAtlasRelease = ['14.0.0'] -> ['14.0.0', '14.0.0']
        jobAtlasRelease = jobAtlasRelease * N_jobTrfList
        N_jobAtlasRelease = len(jobAtlasRelease)

    if (N_jobParameterList == N_jobHomePackageList) and \
       (N_jobHomePackageList == N_jobTrfList) and \
       (N_jobTrfList == N_jobAtlasRelease):
        if N_jobAtlasRelease == 1:
            tolog("Multi-trf verification succeeded (single job)")
        else:
            tolog("Multi-trf verification succeeded")
    else:
        pilotErrorDiag = "Multi-trf verification failed: N(jobPars) eq %d, but N(homepackage,transformation,AtlasRelease) eq (%d,%d,%d)" %\
                         (N_jobParameterList, N_jobHomePackageList, N_jobTrfList, N_jobAtlasRelease)
        tolog("!!FAILED!!2999!! %s" % (pilotErrorDiag))
        ec = error.ERR_SETUPFAILURE

    return ec, pilotErrorDiag, jobAtlasRelease
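# Hedged usage sketch (values are illustrative, not from the source): a two-step
# multi-trf job whose task definition lists only a single AtlasRelease. The helper
# pads the release list so that all four lists end up the same length.
ec, pilotErrorDiag, releases = verifyMultiTrf(
    jobParameterList=['maxEvents=10', 'inputAODFile=in.pool.root'],
    jobHomePackageList=['AtlasProduction/17.2.0.1', 'AtlasProduction/17.2.0.1'],
    jobTrfList=['Reco_trf.py', 'Merging_trf.py'],
    jobAtlasRelease=['17.2.0'])
# on success: ec == 0 and releases == ['17.2.0', '17.2.0']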
def put_data(self, pfn, ddm_storage, fsize=0, fchecksum=0, dsname='', extradirs='', **pdict):
    """ Should be generic: executes setup and command after it """

    error = PilotErrors()
    pilotErrorDiag = ""

    # the original body referenced undefined names; map them to the actual parameters
    source = pfn
    destination = ddm_storage

    if fsize == 0 or fchecksum == 0:
        ec, pilotErrorDiag, fsize, fchecksum = getLocalFileInfo(source, csumtype="adler32")
        if ec != 0:
            return SiteMover.put_data_retfail(ec, pilotErrorDiag)

    s, o = commands.getstatusoutput('source %s; %s %s %s' % (self._setup, self._copycmd, source, destination))
    if s != 0:
        check_syserr(s, o)
        pilotErrorDiag = "Error during copy: %s" % (o)
        tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
        return SiteMover.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)

    # TODO: how are md5 and size controlled?
    return 0, pilotErrorDiag, destination, fsize, fchecksum, ARCH_DEFAULT
def put_data(self, source, ddm_storage, fsize=0, fchecksum=0, dsname='', **pdict):
    """
    Data transfer using rfcp - generic version
    It's not advisable to use this right now because there's no easy way
    to register the srm space token if the file is copied with rfcp
    """

    error = PilotErrors()

    # Get input parameters from pdict
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 'castorSVC', lfn, guid)

    pilotErrorDiag = "put_data does not work for this mover"
    tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
    self.prepareReport('NOT_IMPL', report)
    return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
def core_get_data(self, envsetup, token, source_surl, dest_path, experiment):
    """ stage-in core function, can be overridden (see stormSiteMover) """

    error = PilotErrors()

    # determine which timeout option to use
    timeout_option = "--connect-timeout 300 --max-time %d" % (self.timeout)

    sslCert = self.sslCert
    sslKey = self.sslKey
    sslCertDir = self.sslCertDir

    # used curl options:
    # --cert: <cert[:passwd]> Client certificate file and password (SSL)
    # --capath: <directory> CA directory (made using c_rehash) to verify
    # --location: Follow Location: hints (H)
    # --output: <file> Write output to <file> instead of stdout
    # --silent: Makes curl mute
    # --show-error: When used with -s it makes curl show error message if it fails
    # Removed for SL6: --ciphers <list of ciphers> (SSL) Specifies which ciphers to use in the connection.

    # define the command string
    _cmd_str = 'lcg-gt %s https' % (source_surl)
    s = -1
    o = '(not defined)'
    try:
        tolog("Executing command: %s" % (_cmd_str))
        s, o = commands.getstatusoutput(_cmd_str)
    except Exception, e:
        tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" % (str(e), s, o))
        o = str(e)
def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
    """ copy output file from disk to local SE """
    # function is based on dCacheSiteMover put function

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    alt = pdict.get('alt', False)
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    token = pdict.get('token', '')
    scope = pdict.get('scope', '')
    dsname = pdict.get('dsname', '')
    analysisJob = pdict.get('analJob', False)
    testLevel = pdict.get('testLevel', '0')
    extradirs = pdict.get('extradirs', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)
    prodSourceLabel = pdict.get('prodSourceLabel', '')

    # get the site information object
    si = getSiteInformation(experiment)

    tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
    if prodSourceLabel == 'ddm' and analysisJob:
        tolog("Treating PanDA Mover job as a production job during stage-out")
        analysisJob = False

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 'xrdcp', lfn, guid)

    filename = os.path.basename(source)

    # get all the proper paths
    ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(error, analysisJob, token, prodSourceLabel, dsname, filename, scope=scope, alt=alt, sitemover=self)  # quick workaround
    if ec != 0:
        reportState = {}
        reportState["clientState"] = tracer_error
        self.prepareReport(reportState, report)
        return self.put_data_retfail(ec, pilotErrorDiag)

    # get the RSE from ToA
    try:
        _RSE = self.getRSE(surl=surl)
    except Exception, e:
        tolog("Warning: Failed to get RSE: %s (can not add this info to tracing report)" % str(e))
def sig2exc(sig, frm):
    """ signal handler """

    error = PilotErrors()

    runJob.setGlobalPilotErrorDiag("!!FAILED!!3000!! SIGTERM Signal %s is caught in child pid=%d!\n" % (sig, os.getpid()))
    tolog(runJob.getGlobalPilotErrorDiag())
    if sig == signal.SIGTERM:
        runJob.setGlobalErrorCode(error.ERR_SIGTERM)
    elif sig == signal.SIGQUIT:
        runJob.setGlobalErrorCode(error.ERR_SIGQUIT)
    elif sig == signal.SIGSEGV:
        runJob.setGlobalErrorCode(error.ERR_SIGSEGV)
    elif sig == signal.SIGXCPU:
        runJob.setGlobalErrorCode(error.ERR_SIGXCPU)
    elif sig == signal.SIGBUS:
        runJob.setGlobalErrorCode(error.ERR_SIGBUS)
    elif sig == signal.SIGUSR1:
        runJob.setGlobalErrorCode(error.ERR_SIGUSR1)
    else:
        runJob.setGlobalErrorCode(error.ERR_KILLSIGNAL)
    runJob.setFailureCode(runJob.getGlobalErrorCode())

    # print to stderr
    print >> sys.stderr, runJob.getGlobalPilotErrorDiag()

    raise SystemError(sig)
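# Hedged usage sketch (assumed wiring, not shown in the source): install sig2exc
# as the handler for the signals it distinguishes, so a killed payload records a
# pilot error code before the SystemError propagates.
import signal
for _sig in (signal.SIGTERM, signal.SIGQUIT, signal.SIGSEGV,
             signal.SIGXCPU, signal.SIGBUS, signal.SIGUSR1):
    signal.signal(_sig, sig2exc)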
def core_get_data(self, envsetup, token, source_surl, dest_path, experiment):
    """ stage-in core function, can be overridden (see stormSiteMover) """

    error = PilotErrors()

    # determine which timeout option to use
    if self.isNewLCGVersion("%s lcg-cp" % (envsetup)):
        timeout_option = "--srm-timeout=%d --connect-timeout=300 --sendreceive-timeout=%d" % (self.timeout, self.timeout)
    else:
        timeout_option = "-t %d" % (self.timeout)

    # used lcg-cp options:
    # --vo: specifies the Virtual Organization the user belongs to
    # -t: time-out
    if token:
        # do not use option -b on SL3 clusters running older versions of lcg_utils
        use_b = True
        s, o = commands.getstatusoutput("%s lcg-cr --version" % (envsetup))
        if s != 0:
            # (BDII collects all information coming from site GIISes and stores them in a permanent database)
            tolog("(Probably too old lcg_utils - skipping BDII disabling)")
            use_b = False

        # for the time being
        use_b = False

        if use_b:
            _cmd_str = '%s lcg-cp --vo atlas --srcsetype srmv2 -s %s -b %s %s file://%s' %\
                       (envsetup, token, timeout_option, source_surl, dest_path)
        else:
            tolog("(Skipping space token for the time being)")
            _cmd_str = '%s lcg-cp --vo atlas %s %s file://%s' % (envsetup, timeout_option, source_surl, dest_path)
    else:
        _cmd_str = '%s lcg-cp --vo atlas %s %s file://%s' % (envsetup, timeout_option, source_surl, dest_path)

    # get the experiment object
    thisExperiment = getExperiment(experiment)

    # add the full stage-in command to the job setup script
    to_script = _cmd_str.replace("file://%s" % os.path.dirname(dest_path), "file://`pwd`")
    to_script = to_script.lstrip(' ')  # remove any initial spaces
    if to_script.startswith('/'):
        to_script = 'source ' + to_script
    thisExperiment.updateJobSetupScript(os.path.dirname(dest_path), to_script=to_script)

    tolog("Executing command: %s" % (_cmd_str))
    s = -1
    o = '(not defined)'
    t0 = os.times()
    try:
        s, o = commands.getstatusoutput(_cmd_str)
    except Exception, e:
        tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" % (str(e), s, o))
        o = str(e)
def pollChildren(self):
    """ check children processes, collect zombie jobs, and update jobDic status """

    pUtil.tolog("Watchdog to check children processes")
    error = PilotErrors()
    try:
        _id, rc = os.waitpid(self.__env['jobDic']['prod'][0], os.WNOHANG)
    except OSError, e:
        pUtil.tolog("Harmless exception when checking child process %s, %s" % (self.__env['jobDic']['prod'][0], e))
        if str(e).rstrip() == "[Errno 10] No child processes":
            pilotErrorDiag = "Exception caught by pilot watchdog: %s" % str(e)
            for j in self.__env['jobDic']['prod'][1]:
                pUtil.tolog("Watchdog. JobID: %s, status [%s]" % (j.jobId, j.result[0]))
                if j.result[0] in ["finished", "failed", "holding", "transferring"]:
                    pUtil.tolog("Job: %s already %s" % (j.jobId, j.result[0]))
                else:
                    pUtil.tolog("!!FAILED!!1000!! Pilot setting state to failed since there are no child processes")
                    pUtil.tolog("!!FAILED!!1000!! %s" % (pilotErrorDiag))
                    pUtil.tolog("Watchdog will fail JobID: %s status: [%s]" % (j.jobId, j.result[0]))
                    j.result[0] = "failed"
                    j.currentState = j.result[0]
                    if j.result[2] == 0:
                        j.result[2] = error.ERR_NOCHILDPROCESSES
                    if j.pilotErrorDiag == "":
                        j.pilotErrorDiag = pilotErrorDiag
        else:
            pass
class DBReleaseHandler:
    """
    Methods for handling the DBRelease file and possibly skip it in the input file list
    In the presence of $[VO_ATLAS_SW_DIR|OSG_APP]/database, the pilot will use these methods to:
      1. Extract the requested DBRelease version from the job parameters string, if present
      2. Scan the $[VO_ATLAS_SW_DIR|OSG_APP]/database dir for available DBRelease files
      3. If the requested DBRelease file is available, continue [else, abort at this point]
      4. Create a DBRelease setup file containing necessary environment variables
      5. Create a new DBRelease file only containing the setup file in the input file directory
      6. Update the job state file
      7. Remove the DBRelease file from the input file list if all previous steps finished correctly
    """

    # private data members
    __error = PilotErrors()              # PilotErrors object
    __version = ""
    __DBReleaseDir = ""
    __filename = "DBRelease-%s.tar.gz"
    __setupFilename = "setup.py"
    __workdir = ""

    def __init__(self, workdir=""):
        """ Default initialization """

        _path = self.getDBReleaseDir()  # _path is a dummy variable
        self.__workdir = workdir

    def removeDBRelease(self, inputFiles, inFilesGuids, realDatasetsIn, dispatchDblock, dispatchDBlockToken, prodDBlockToken):
        """ remove the given DBRelease files from the input file list """
        # will only remove the DBRelease files that are already available locally

        # identify all DBRelease files in the list (mark all for removal)
        # note: multi-trf jobs tend to have the same DBRelease file listed twice
        position = 0
        positions_list = []
        for f in inputFiles:
            if "DBRelease" in f:
                positions_list.append(position)
                tolog("Will remove file %s from input file list" % (f))
            position += 1

    # remove the corresponding guids, datasets and tokens
    # (delete from the highest position first so the remaining indices stay valid)
        for position in sorted(positions_list, reverse=True):
            try:
                del(inputFiles[position])
            except Exception, e:
                tolog("!!WARNING!!1990!! Could not delete object %d in inFiles: %s" % (position, str(e)))
            else:
                tolog("Removed item %d in inFiles" % (position))
            try:
                del(inFilesGuids[position])
            except Exception, e:
                tolog("!!WARNING!!1990!! Could not delete object %d in inFilesGuids: %s" % (position, str(e)))
            else:
                tolog("Removed item %d in inFilesGuids" % (position))
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """ copy input file from SE to local dir """

    error = PilotErrors()

    # Get input parameters from pdict
    jobId = pdict.get('jobId', '')
    workDir = pdict.get('workDir', '')
    experiment = pdict.get('experiment', '')
    os_bucket_id = pdict.get('os_bucket_id', -1)

    # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
    useCT = pdict.get('usect', True)
    prodDBlockToken = pdict.get('access', '')

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 's3objectstore', lfn, guid)

    if path == '':
        path = './'
    fullname = os.path.join(path, lfn)

    status, output = self.stageIn(gpfn, fullname, fsize, fchecksum, experiment, os_bucket_id=os_bucket_id)

    if status == 0:
        updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", ftype="input")
        state = "DONE"
    else:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state is None:
            state = "PSTAGE_FAIL"

    # self.__sendReport(state, report)
    self.prepareReport(state, report)
    return status, output
def core_get_data(self, envsetup, token, source_surl, local_fullname, experiment):
    """ special get function developed for storm sites """

    error = PilotErrors()

    # Transform the surl into a full surl
    full_se_endpoint = self.extractSE(readpar('se').split(",")[0])[1]
    prefix = os.path.commonprefix([source_surl, full_se_endpoint])
    if prefix:
        # Can use the bdii-free form
        source_surl = full_se_endpoint + source_surl[len(prefix):]
        _cmd_str = '%s lcg-gt --nobdii --setype srmv2 "%s" file' % (envsetup, source_surl)
    else:
        # Fallback solution, use old lcg-gt form
        # get the TURL using the SURL
        tolog("!!WARNING!!1234!! Source surl does not match %s, cannot use the bdii-independent lcg-gt" % full_se_endpoint)
        _cmd_str = '%s lcg-gt "%s" file' % (envsetup, source_surl)

    tolog("Executing command: %s" % (_cmd_str))
    t0 = os.times()
    s, o = commands.getstatusoutput(_cmd_str)
    t1 = os.times()
    t = t1[4] - t0[4]
    tolog("Command finished after %f s" % (t))
    if s == 0:
        # get the experiment object
        thisExperiment = getExperiment(experiment)

        # add the full stage-in command to the job setup script
        to_script = _cmd_str
        to_script = to_script.lstrip(' ')  # remove any initial spaces
        if to_script.startswith('/'):
            to_script = 'source ' + to_script
        thisExperiment.updateJobSetupScript(os.path.dirname(local_fullname), to_script=to_script)

        source_turl, req_token = o.split('\n')
        source_turl = source_turl.replace('file://', '')
        tolog("Creating link from %s to %s" % (source_turl, local_fullname))
        try:
            os.symlink(source_turl, local_fullname)
            _cmd_str = '%s lcg-sd %s %s 0' % (envsetup, source_surl, req_token)
            tolog("Executing command: %s" % (_cmd_str))
            s, o = commands.getstatusoutput(_cmd_str)
            # Do we need to check the exit status of lcg-sd? What do we do if it fails?
            tolog("get_data succeeded")
        except Exception, e:
            pilotErrorDiag = "Exception caught: %s" % str(e)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            tolog("get_data failed")
            return error.ERR_STAGEINFAILED, pilotErrorDiag
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """
    The local file (local access to the dCache file) is assumed to have a relative path
    that is the same as the relative path in the 'gpfn'
    loc_... are the variables used to access the file in the locally exported file system
    TODO: document GPFN format
    TODO: document better constraint
    """

    error = PilotErrors()
    pilotErrorDiag = ""

    # get the DQ2 tracing report
    try:
        report = pdict['report']
    except:
        report = {}
    else:
        # set the proper protocol
        report['protocol'] = 'HU'
        # mark the relative start
        report['relativeStart'] = time()
        # the current file
        report['filename'] = lfn
        # guid
        report['guid'] = guid.replace('-', '')

    # get a proper envsetup
    envsetup = self.getEnvsetup(get=True)

    if self._setup:
        _setup_str = "source %s; " % self._setup
    else:
        _setup_str = envsetup

    try:
        timeout = pdict['timeout']
    except:
        timeout = 5 * 3600

    if gpfn.find('SFN') != -1:
        s = gpfn.split('SFN=')
        loc_pfn = s[1]
    else:
        _tmp = gpfn.split('/', 3)
        loc_pfn = '/' + _tmp[3]

    _cmd_str = '%snecp %s %s/%s' % (_setup_str, loc_pfn, path, lfn)
    tolog("NECP executing (timeout %s): %s" % (timeout, _cmd_str))
    report['transferStart'] = time()
    try:
        s, telapsed, cout, cerr = timed_command(_cmd_str, timeout)
    except Exception, e:
        tolog("!!WARNING!!2999!! timed_command() threw an exception: %s" % str(e))
        s = 1
        o = str(e)
        telapsed = timeout
def getOutFilesGuids(outFiles, workdir, experiment, TURL=False):
    """ get the outFilesGuids from the PFC """

    ec = 0
    pilotErrorDiag = ""
    outFilesGuids = []

    # Get the experiment object and the GUID source filename
    thisExperiment = getExperiment(experiment)
    filename = thisExperiment.getGUIDSourceFilename()

    # If a source file should not be used (ie empty filename string), then generate the GUIDs here
    if filename == "":
        tolog("Pilot will generate GUIDs for the output files")
        for i in range(0, len(outFiles)):
            guid = getGUID()
            if guid == "":
                guid = "- GUID generation failed -"
            outFilesGuids.append(guid)
        return ec, pilotErrorDiag, outFilesGuids
    else:
        tolog("Pilot will get GUIDs for the output files from source: %s" % (filename))
        pfcFile = os.path.join(workdir, filename)  # "%s/PoolFileCatalog.xml" % (workdir)

    # The PFC used for Event Service will be TURL based, use the corresponding file
    if TURL:
        pfcFile = pfcFile.replace(".xml", "TURL.xml")

    # Initialization: make sure the guid list has the same length as the file list
    for i in range(0, len(outFiles)):
        outFilesGuids.append(None)

    # make sure the PFC exists
    if os.path.isfile(pfcFile):
        from xml.dom import minidom
        xmldoc = minidom.parse(pfcFile)
        fileList = xmldoc.getElementsByTagName("File")
        for thisfile in fileList:
            gpfn = str(thisfile.getElementsByTagName("pfn")[0].getAttribute("name"))
            guid = str(thisfile.getAttribute("ID"))
            for i in range(0, len(outFiles)):
                if outFiles[i] == gpfn:
                    outFilesGuids[i] = guid
    else:
        pilotErrorDiag = "PFC file does not exist: %s" % (pfcFile)
        tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
        error = PilotErrors()
        ec = error.ERR_MISSINGPFC

    return ec, pilotErrorDiag, outFilesGuids
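# For reference, a minimal PoolFileCatalog that the loop above can parse. This is
# a hedged sketch: only the File/@ID attribute and the nested pfn/@name attribute
# are actually read, everything else in the layout is assumed.
#
# <?xml version="1.0" encoding="UTF-8"?>
# <POOLFILECATALOG>
#   <File ID="0EE9F566-1EE5-4B43-AC39-EF1C8622CE67">
#     <physical>
#       <pfn filetype="ROOT_All" name="AOD.pool.root"/>
#     </physical>
#     <logical/>
#   </File>
# </POOLFILECATALOG>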
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """ copy input file from SE to local dir """

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    jobId = pdict.get('jobId', '')
    workDir = pdict.get('workDir', '')
    experiment = pdict.get('experiment', '')
    transferType = pdict.get('transferType', '')
    proxycheck = pdict.get('proxycheck', False)

    # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
    useCT = pdict.get('usect', True)
    prodDBlockToken = pdict.get('access', '')

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 'local', lfn, guid)

    tolog("transferType=%s" % (transferType))
    status, output = self.getStageInMode(lfn, prodDBlockToken, transferType)
    tolog("output=%s" % str(output))
    if output["transfer_mode"]:
        updateFileState(lfn, workDir, jobId, mode="transfer_mode", state=output["transfer_mode"], ftype="input")
        tolog("updated file state for lfn=%s, workDir=%s, jobId=%s, state=%s" % (lfn, workDir, jobId, output["transfer_mode"]))
    if status != 0:
        self.prepareReport(output["report"], report)
        return status, output["errorLog"]

    if path == '':
        path = './'
    fullname = os.path.join(path, lfn)

    status, output = self.stageIn(gpfn, fullname, fsize, fchecksum, experiment)

    if status == 0:
        updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", ftype="input")

    self.prepareReport(output["report"], report)
    return status, output["errorLog"]
def updateOutPFC(job, **kwargs):
    file_name = "OutPutFileCatalog.xml"
    file_path = os.path.join(DorE(kwargs, 'thisSite').workdir, file_name)
    try:
        guids_status = pUtil.PFCxml(job.experiment, file_path, job.outFiles, fguids=job.outFilesGuids, fntag="pfn",
                                    analJob=pUtil.isAnalysisJob(job.trf.split(",")[0]), jr=True)
    except Exception, e:
        log("!!FAILED!!1105!! Exception caught (Could not generate xml for the remaining output files): %s" % str(e))
        job.result[2] = PilotErrors().ERR_LOSTJOBXML
        return False
    # return the catalog path so callers (e.g. TransferFiles) can pass it to the mover
    return file_path
def setupNordugridTrf(self, job, analysisJob, wgetCommand, pilot_initdir):
    """ perform the Nordugrid trf setup """

    error = PilotErrors()
    pilotErrorDiag = ""
    cmd = ""

    # assume that the runtime script has already been created
    if not os.environ.has_key('RUNTIME_CONFIG_DIR'):
        pilotErrorDiag = "Environment variable not set: RUNTIME_CONFIG_DIR"
        tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
        return error.ERR_SETUPFAILURE, pilotErrorDiag, ""

    runtime_script = "%s/APPS/HEP/ATLAS-%s" % (os.environ['RUNTIME_CONFIG_DIR'], job.release)
    if os.path.exists(runtime_script):
        cmd = ". %s 1" % (runtime_script)
        if analysisJob:
            # try to download the analysis trf
            status, pilotErrorDiag, trfName = self.getAnalysisTrf(wgetCommand, job.trf, pilot_initdir)
            if status != 0:
                return status, pilotErrorDiag, ""
            trfName = "./" + trfName
        else:
            trfName = job.trf
        cmd += '; export ATLAS_RELEASE=%s;export AtlasVersion=%s;export AtlasPatchVersion=%s' % (job.homePackage.split('/')[-1], job.homePackage.split('/')[-1], job.homePackage.split('/')[-1])
        cmd += "; %s %s" % (trfName, job.jobPars)
    elif verifyReleaseString(job.release) == "NULL":
        if analysisJob:
            # try to download the analysis trf
            status, pilotErrorDiag, trfName = self.getAnalysisTrf(wgetCommand, job.trf, pilot_initdir)
            if status != 0:
                return status, pilotErrorDiag, ""
            trfName = "./" + trfName
        else:
            trfName = job.trf
        cmd = "%s %s" % (trfName, job.jobPars)
    else:
        pilotErrorDiag = "Could not locate runtime script: %s" % (runtime_script)
        tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
        return error.ERR_SETUPFAILURE, pilotErrorDiag, ""

    # correct for multi-core if necessary (especially important in case coreCount=1 to limit parallel make)
    cmd = self.addMAKEFLAGS(job.coreCount, "") + cmd

    return 0, pilotErrorDiag, cmd
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """ Executes setup and command after it """

    error = PilotErrors()
    ec = 0
    pilotErrorDiag = ""

    s, o = commands.getstatusoutput('source %s; %s %s %s' % (self._setup, self._copycmd, gpfn, path))
    if s != 0:
        pilotErrorDiag = "Error during copy: %s" % (o)
        tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
        ec = error.ERR_STAGEINFAILED

    # TODO: how are md5 and size controlled? some processing?
    return ec, pilotErrorDiag
class Diagnosis(object):

    # private data members
    __instance = None        # Boolean used by subclasses to become a Singleton
    __error = PilotErrors()  # PilotErrors object

    # Required methods
    def __init__(self):
        """ Default initialization """
        # e.g. self.__errorLabel = errorLabel
        pass
def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
    """ copy output file from disk to local SE """
    # function is based on dCacheSiteMover put function

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    alt = pdict.get('alt', False)
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    token = pdict.get('token', '')
    scope = pdict.get('scope', '')
    dsname = pdict.get('dsname', '')
    analysisJob = pdict.get('analJob', False)
    testLevel = pdict.get('testLevel', '0')
    extradirs = pdict.get('extradirs', '')
    experiment = pdict.get('experiment', '')
    logPath = pdict.get('logPath', '')
    proxycheck = pdict.get('proxycheck', False)
    prodSourceLabel = pdict.get('prodSourceLabel', '')

    # get the site information object
    si = getSiteInformation(experiment)

    tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
    if prodSourceLabel == 'ddm' and analysisJob:
        tolog("Treating PanDA Mover job as a production job during stage-out")
        analysisJob = False

    # get the DQ2 tracing report
    report = self.getStubTracingReport(pdict['report'], 'xrootdObjectstore', lfn, guid)

    filename = os.path.basename(source)

    if logPath != "":
        surl = logPath
    else:
        surl = os.path.join(destination, lfn)

    # get the DQ2 site name from ToA
    try:
        _dq2SiteName = self.getDQ2SiteName(surl=surl)
    except Exception, e:
        tolog("Warning: Failed to get the DQ2 site name: %s (can not add this info to tracing report)" % str(e))
def getFinalState(result):
    """
    Figure out the final job state (finished or failed)
    Simplifies job recovery
    """

    state = "failed"

    # job has failed if transExitCode != 0
    if result[1] != 0:
        state = "failed"
    else:
        error = PilotErrors()
        # job has finished if the pilotErrorCode is recoverable, or if the job was killed while in the holding state
        if ((error.isRecoverableErrorCode(result[2])) or (result[2] == error.ERR_KILLSIGNAL and result[0] == "holding")):
            state = "finished"

    return state
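# Hedged usage sketch (illustrative values): result is the pilot's
# [state, transExitCode, pilotErrorCode] triple used throughout these modules.
error = PilotErrors()
result = ["holding", 0, error.ERR_KILLSIGNAL]
print getFinalState(result)  # -> "finished" (kill signal while holding is recoverable)
result = ["running", 65, 0]
print getFinalState(result)  # -> "failed" (non-zero transExitCode)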
def prepareOutFiles(outFiles, logFile, workdir, fullpath=False):
    """ verify and prepare the output files for transfer """
    # fullpath = True means that the file in outFiles already has a full path; adding it to workdir is then not needed

    ec = 0
    pilotErrorDiag = ""
    outs = []
    modt = []

    from SiteMover import SiteMover
    for outf in outFiles:
        if outf and outf != 'NULL':  # non-empty string and not NULL
            path = os.path.join(workdir, outf)
            if (not os.path.isfile(path) and not fullpath) or (not os.path.isfile(outf) and fullpath):
                pilotErrorDiag = "Expected output file %s does not exist" % (path)
                tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
                error = PilotErrors()
                ec = error.ERR_MISSINGOUTPUTFILE
                break
            else:
                tolog("outf = %s" % (outf))
                if fullpath:
                    # remove the full path here from outf
                    workdir = os.path.dirname(outf)
                    outf = os.path.basename(outf)
                outs.append(outf)
                # get the modification time for the file (needed by NG)
                modt.append(SiteMover.getModTime(workdir, outf))

                tolog("Output file(s):")
                try:
                    _ec, _rs = commands.getstatusoutput("ls -l %s/%s" % (workdir, outf))
                except Exception, e:
                    tolog(str(e))
                else:
                    tolog(_rs)
def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
    """
    Move the file from the current local directory to the local pilot init dir

    Parameters are:
      source -- full path of the file in local directory
      destination -- destination SE, method://[hostname[:port]]/full-dir-path/ (NB: no file name)
                     NOT USED (pinitdir is used instead)
      fsize -- file size of the source file (evaluated if 0)
      fchecksum -- MD5 checksum of the source file (evaluated if 0)
      pdict -- to allow additional parameters that may make sense with specific movers

    Assume that the pilot init dir is locally mounted and its local path is the same as the remote path
    if both fsize and fchecksum (for the source) are given and != 0 these are assumed without reevaluating them
    returns: exitcode, pilotErrorDiag, gpfn, fsize, fchecksum
    """

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    DN = pdict.get('DN', '')
    dsname = pdict.get('dsname', '')
    analJob = pdict.get('analJob', False)
    sitename = pdict.get('sitename', '')
    testLevel = pdict.get('testLevel', '0')
    pilot_initdir = pdict.get('pinitdir', '')
    experiment = pdict.get('experiment', "ATLAS")

    # get the site information object
    si = getSiteInformation(experiment)

    # are we on a tier 3?
    if si.isTier3():
        outputDir = self.getTier3Path(dsname, DN)
        tolog("Writing output on a Tier 3 site to: %s" % (outputDir))

        # create the dirs if they don't exist
        try:
            self.mkdirWperm(outputDir)
        except Exception, e:
            tolog("!!WARNING!!2999!! Could not create dir: %s, %s" % (outputDir, str(e)))
class ErrorDiagnosis(Diagnosis):

    # private data members
    __instance = None        # Boolean used by subclasses to become a Singleton
    __error = PilotErrors()  # PilotErrors object

    def __init__(self):
        """ Default initialization """
        # e.g. self.__errorLabel = errorLabel
        pass

    def __new__(cls, *args, **kwargs):
        """ Override the __new__ method to make the class a singleton """

        if not cls.__instance:
            cls.__instance = super(ErrorDiagnosis, cls).__new__(cls, *args, **kwargs)
        return cls.__instance

    def interpretPayload(self, job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, failureCode):
        """ Interpret the payload, look for specific errors in the stdout """

        # get the experiment object
        thisExperiment = getExperiment(job.experiment)
        if not thisExperiment:
            job.pilotErrorDiag = "ErrorDiagnosis did not get an experiment object from the factory"
            job.result[2] = self.__error.ERR_GENERALERROR  # change to better/new error code
            tolog("!!WARNING!!3234!! %s" % (job.pilotErrorDiag))
            return job

        ### WARNING: EXPERIMENT SPECIFIC, MOVE LATER
        try:
            ec, pilotErrorDiag = self.processJobReport(job.workdir)
        except Exception, e:
            tolog("!!WARNING!!1114!! Caught exception: %s" % (e))
        else: