def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """ copy input file from SE to local dir """

    error = PilotErrors()

    # Get input parameters from pdict
    jobId = pdict.get('jobId', '')
    workDir = pdict.get('workDir', '')
    experiment = pdict.get('experiment', '')
    pandaProxySecretKey = pdict.get('pandaProxySecretKey')
    jobSetID = pdict.get('jobsetID')

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 's3objectstorepresignedurl', lfn, guid)

    if path == '':
        path = './'
    fullname = os.path.join(path, lfn)

    # pandaID, filename, jobSetID, pandaProxySecretKey=None, stageIn=True
    status, output = self.stageIn(jobId, lfn, jobSetID, pandaProxySecretKey, fullname, fsize, fchecksum, experiment)

    if status == 0:
        updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", type="input")
        state = "DONE"
    else:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state == None:
            state = "PSTAGE_FAIL"

    # self.__sendReport(state, report)
    self.prepareReport(state, report)

    return status, output
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """ copy input file from SE to local dir """

    error = PilotErrors()

    # Get input parameters from pdict
    jobId = pdict.get('jobId', '')
    workDir = pdict.get('workDir', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)

    # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
    useCT = pdict.get('usect', True)
    prodDBlockToken = pdict.get('access', '')

    # get the DQ2 tracing report
    report = self.getStubTracingReport(pdict['report'], 's3objectstore', lfn, guid)

    if path == '':
        path = './'
    fullname = os.path.join(path, lfn)

    status, output = self.stageIn(gpfn, fullname, fsize, fchecksum, experiment)

    if status == 0:
        updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", type="input")
        state = "DONE"
    else:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state == None:
            state = "PSTAGE_FAIL"

    self.prepareReport(state, report)

    return status, output
def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
    """ copy output file from disk to local SE """
    # function is based on dCacheSiteMover put function

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    token = pdict.get('token', '')
    scope = pdict.get('scope', '')
    dsname = pdict.get('dsname', '')
    experiment = pdict.get('experiment', '')
    outputDir = pdict.get('outputDir', '')
    os_bucket_id = pdict.get('os_bucket_id', -1)
    timeout = pdict.get('timeout', None)
    if not timeout:
        timeout = self.timeout

    # get the site information object
    si = getSiteInformation(experiment)

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 's3objectstore', lfn, guid)

    parsed = urlparse.urlparse(destination)
    scheme = parsed.scheme
    hostname = parsed.netloc.partition(':')[0]
    port = int(parsed.netloc.partition(':')[2])
    report['remoteSite'] = '%s://%s:%s' % (scheme, hostname, port)

    filename = os.path.basename(source)
    surl = destination
    self.log("surl=%s, timeout=%s" % (surl, timeout))

    if "log.tgz" in surl:
        surl = surl.replace(lfn, "%s:%s" % (scope, lfn))
    else:
        report['eventType'] = 'put_es'

    status, output, size, checksum = self.stageOut(source, surl, token, experiment, outputDir=outputDir, timeout=timeout, os_bucket_id=os_bucket_id, report=report)
    if status != 0:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state == None:
            state = "PSTAGE_FAIL"
        # self.__sendReport(state, report)
        self.prepareReport(state, report)
        return self.put_data_retfail(status, output, surl)

    state = "DONE"
    # self.__sendReport(state, report)
    self.prepareReport(state, report)

    return 0, pilotErrorDiag, surl, size, checksum, self.arch_type
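# Usage note (illustrative, not part of the pilot source; the endpoint below is
# hypothetical): the urlparse block above turns the destination SURL into the
# 'remoteSite' field of the tracing report, e.g.
#
#   destination = "s3://cephgw.example.org:8443/atlas_bucket/file.root"
#   parsed = urlparse.urlparse(destination)  # scheme 's3', netloc 'cephgw.example.org:8443'
#   # -> report['remoteSite'] == "s3://cephgw.example.org:8443"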
def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
    """ copy output file from disk to local SE """
    # function is based on dCacheSiteMover put function

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    alt = pdict.get('alt', False)
    jobId = pdict.get('jobId', '')
    jobSetID = pdict.get('jobsetID', '')
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    #token = pdict.get('token', '')
    scope = pdict.get('scope', '')
    dsname = pdict.get('dsname', '')
    analysisJob = pdict.get('analJob', False)
    testLevel = pdict.get('testLevel', '0')
    extradirs = pdict.get('extradirs', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)
    prodSourceLabel = pdict.get('prodSourceLabel', '')
    outputDir = pdict.get('outputDir', '')
    timeout = pdict.get('timeout', None)
    pandaProxySecretKey = pdict.get('pandaProxySecretKey')
    if not timeout:
        timeout = self.timeout

    # get the site information object
    si = getSiteInformation(experiment)

    tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 's3objectstorepresignedurl', lfn, guid)

    filename = os.path.basename(source)
    surl = destination

    status, output, size, checksum = self.stageOut(source, jobId, lfn, jobSetID, pandaProxySecretKey, experiment, outputDir=outputDir, timeout=timeout)
    if status != 0:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state == None:
            state = "PSTAGE_FAIL"
        # self.__sendReport(state, report)
        self.prepareReport(state, report)
        return self.put_data_retfail(status, output, surl)

    state = "DONE"
    # self.__sendReport(state, report)
    # self.prepareReport(state, report)

    return 0, pilotErrorDiag, surl, size, checksum, self.arch_type
def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
    """ copy output file from disk to local SE """
    # function is based on dCacheSiteMover put function

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    alt = pdict.get('alt', False)
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    token = pdict.get('token', '')
    scope = pdict.get('scope', '')
    dsname = pdict.get('dsname', '')
    analysisJob = pdict.get('analJob', False)
    testLevel = pdict.get('testLevel', '0')
    extradirs = pdict.get('extradirs', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)
    prodSourceLabel = pdict.get('prodSourceLabel', '')

    # get the site information object
    si = getSiteInformation(experiment)

    tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
    if prodSourceLabel == 'ddm' and analysisJob:
        tolog("Treating PanDA Mover job as a production job during stage-out")
        analysisJob = False

    # get the DQ2 tracing report
    report = self.getStubTracingReport(pdict['report'], 's3objectstore', lfn, guid)

    filename = os.path.basename(source)
    surl = destination

    status, output, size, checksum = self.stageOut(source, surl, token, experiment)
    if status != 0:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state == None:
            state = "PSTAGE_FAIL"
        self.prepareReport(state, report)
        return self.put_data_retfail(status, output, surl)

    state = "DONE"
    self.prepareReport(state, report)

    return 0, pilotErrorDiag, surl, size, checksum, self.arch_type
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """ copy input file from SE to local dir """

    error = PilotErrors()

    # Get input parameters from pdict
    jobId = pdict.get('jobId', '')
    workDir = pdict.get('workDir', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)
    os_bucket_id = pdict.get('os_bucket_id', -1)

    # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
    useCT = pdict.get('usect', True)
    prodDBlockToken = pdict.get('access', '')

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 'gfal-copy', lfn, guid)

    if path == '':
        path = './'
    fullname = os.path.join(path, lfn)

    # get the site information object
    si = getSiteInformation(experiment)
    ret_path = si.getCopyPrefixPathNew(gpfn, stageIn=True)
    if not ret_path.startswith("s3:"):
        errorLog = "Failed to use copyprefix to convert the current path to S3 path."
        tolog("!!WARNING!!1777!! %s" % (errorLog))
        status = PilotErrors.ERR_STAGEINFAILED
        state = "PSTAGE_FAIL"
        output = errorLog
    else:
        gpfn = ret_path
        status, output = self.stageIn(gpfn, fullname, fsize, fchecksum, experiment, os_bucket_id=os_bucket_id)

    if status == 0:
        updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", ftype="input")
        state = "DONE"
    else:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state == None:
            state = "PSTAGE_FAIL"

    self.prepareReport(state, report)

    return status, output
def getFinalState(result):
    """ Figure out the final job state (finished or failed)
    Simplifies job recovery """

    state = "failed"

    # the job has failed if transExitCode != 0
    if result[1] != 0:
        state = "failed"
    else:
        error = PilotErrors()
        # the job has finished if pilotErrorCode is recoverable,
        # or if the job was killed while in the holding state
        if error.isRecoverableErrorCode(result[2]) or (result[2] == error.ERR_KILLSIGNAL and result[0] == "holding"):
            state = "finished"

    return state
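# Usage sketch (illustrative, not part of the pilot source): the result triple
# follows the convention used throughout the pilot -- result[0] is the job state
# string, result[1] the transExitCode, result[2] the pilotErrorCode.
#
#   error = PilotErrors()
#   getFinalState(["holding", 0, error.ERR_KILLSIGNAL])   # -> "finished" (killed while holding)
#   getFinalState(["failed", 1, 0])                       # -> "failed" (transExitCode != 0)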
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """ copy input file from SE to local dir """

    error = PilotErrors()

    # Get input parameters from pdict
    jobId = pdict.get('jobId', '')
    workDir = pdict.get('workDir', '')
    experiment = pdict.get('experiment', '')
    os_bucket_id = pdict.get('os_bucket_id', -1)

    # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
    useCT = pdict.get('usect', True)
    prodDBlockToken = pdict.get('access', '')

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 's3objectstore', lfn, guid)

    if path == '':
        path = './'
    fullname = os.path.join(path, lfn)

    status, output = self.stageIn(gpfn, fullname, fsize, fchecksum, experiment, os_bucket_id=os_bucket_id, report=report)

    report['eventType'] = 'get_es'
    parsed = urlparse.urlparse(gpfn)
    scheme = parsed.scheme
    hostname = parsed.netloc.partition(':')[0]
    port = int(parsed.netloc.partition(':')[2])
    report['remoteSite'] = '%s://%s:%s' % (scheme, hostname, port)

    if status == 0:
        updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", ftype="input")
        state = "DONE"
    else:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state == None:
            state = "PSTAGE_FAIL"

    # self.__sendReport(state, report)
    self.prepareReport(state, report)

    return status, output
def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
    """ copy output file from disk to local SE """
    # function is based on dCacheSiteMover put function

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    alt = pdict.get('alt', False)
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    token = pdict.get('token', '')
    scope = pdict.get('scope', '')
    dsname = pdict.get('dsname', '')
    analysisJob = pdict.get('analJob', False)
    testLevel = pdict.get('testLevel', '0')
    extradirs = pdict.get('extradirs', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)
    prodSourceLabel = pdict.get('prodSourceLabel', '')

    # get the site information object
    si = getSiteInformation(experiment)

    tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
    if prodSourceLabel == 'ddm' and analysisJob:
        tolog("Treating PanDA Mover job as a production job during stage-out")
        analysisJob = False

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 'gfal-copy', lfn, guid)

    filename = os.path.basename(source)

    # get all the proper paths
    ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(error, analysisJob, token, prodSourceLabel, dsname, filename, scope=scope, alt=alt, sitemover=self)  # quick workaround
    if ec != 0:
        self.prepareReport(tracer_error, report)
        return self.put_data_retfail(ec, pilotErrorDiag)

    # get the local adler32 checksum
    status, output, adler_size, adler_checksum = self.getLocalFileInfo(source, checksumType="adler32")
    if status != 0:
        errorLog = 'Failed to get local file %s adler32 checksum: %s' % (source, output)
        tolog("!!WARNING!!1777!! %s" % (errorLog))
        status = PilotErrors.ERR_STAGEINFAILED
        state = "PSTAGE_FAIL"
        output = errorLog
        self.prepareReport(state, report)
        return self.put_data_retfail(status, output, surl)

    ret_path = si.getCopyPrefixPathNew(surl, stageIn=False)
    tolog("Convert destination: %s to new path: %s" % (surl, ret_path))
    if not ret_path.startswith("s3:"):
        errorLog = "Failed to use copyprefix to convert the current path to S3 path."
        tolog("!!WARNING!!1777!! %s" % (errorLog))
        status = PilotErrors.ERR_STAGEINFAILED
        state = "PSTAGE_FAIL"
        output = errorLog
        size = None
        checksum = None
    else:
        status, output, size, checksum = self.stageOut(source, ret_path, token, experiment)

    if status != 0:
        errors = PilotErrors()
        state = errors.getErrorName(status)
        if state == None:
            state = "PSTAGE_FAIL"
        self.prepareReport(state, report)
        return self.put_data_retfail(status, output, surl)
    else:
        if size == adler_size:
            tolog("The file size has not changed. Will check whether adler32 has changed.")
            status, output, new_adler_size, new_adler_checksum = self.getLocalFileInfo(source, checksumType="adler32")
            if status != 0:
                errorLog = 'Failed to get local file %s adler32 checksum: %s' % (source, output)
                tolog("!!WARNING!!1777!! %s" % (errorLog))
                status = PilotErrors.ERR_STAGEINFAILED
                state = "PSTAGE_FAIL"
                output = errorLog
                self.prepareReport(state, report)
                return self.put_data_retfail(status, output, surl)
            else:
                if adler_checksum == new_adler_checksum:
                    tolog("The file checksum has not changed. Will use adler32 %s to replace the md5 checksum %s" % (adler_checksum, checksum))
                    checksum = adler_checksum
                else:
                    errorLog = "The file checksum changed from %s (before transfer) to %s (after transfer)" % (adler_checksum, new_adler_checksum)
                    tolog("!!WARNING!!1777!! %s" % (errorLog))
                    status = PilotErrors.ERR_STAGEINFAILED
                    state = "PSTAGE_FAIL"
                    output = errorLog
                    self.prepareReport(state, report)
                    return self.put_data_retfail(status, output, surl)

    state = "DONE"
    self.prepareReport(state, report)

    return 0, pilotErrorDiag, surl, size, checksum, self.arch_type
class ATLASSiteInformation(SiteInformation):

    # private data members
    __experiment = "ATLAS"
    __instance = None
    __error = PilotErrors()  # PilotErrors object
    __securityKeys = {}
    __benchmarks = None

    # Required methods

    def __init__(self):
        """ Default initialization """

        pass

    def __new__(cls, *args, **kwargs):
        """ Override the __new__ method to make the class a singleton """

        if not cls.__instance:
            cls.__instance = super(ATLASSiteInformation, cls).__new__(cls, *args, **kwargs)

        return cls.__instance

    def getExperiment(self):
        """ Return a string with the experiment name """

        return self.__experiment

    def isTier1(self, sitename):
        """ Is the given site a Tier-1? """
        # E.g. on a Tier-1 site, the alternative stage-out algorithm should not be used
        # Note: sitename is the PanDA sitename, not the Rucio sitename (RSE)

        status = False

        for cloud in self.getCloudList():
            if sitename in self.getTier1List(cloud):
                status = True
                break

        return status

    def isTier2(self, sitename):
        """ Is the given site a Tier-2? """
        # Logic: it is a T2 if it is not a T1 or a T3

        return (not (self.isTier1(sitename) or self.isTier3()))

    def isTier3(self):
        """ Is the given site a Tier-3? """
        # Note: defined by DB

        if readpar('ddm') == "local":
            status = True
        else:
            status = False

        return status

    def getCloudList(self):
        """ Return a list of all clouds """

        tier1 = self.setTier1Info()
        return tier1.keys()

    def setTier1Info(self):
        """ Set the Tier-1 information """

        tier1 = {"CA": ["TRIUMF", ""],
                 "CERN": ["CERN-PROD", ""],
                 "DE": ["FZK-LCG2", ""],
                 "ES": ["pic", ""],
                 "FR": ["IN2P3-CC", ""],
                 "IT": ["INFN-T1", ""],
                 "ND": ["ARC", ""],
                 "NL": ["SARA-MATRIX", ""],
                 "OSG": ["BNL_CVMFS_1", ""],
                 "RU": ["RRC-KI-T1", ""],
                 "TW": ["Taiwan-LCG2", ""],
                 "UK": ["RAL-LCG2", ""],
                 "US": ["BNL_PROD", "BNL_PROD-condor"]
                 }
        return tier1

    def getTier1Name(self, cloud):
        """ Return the site name of the Tier-1 """

        return self.getTier1List(cloud)[0]

    def getTier1List(self, cloud):
        """ Return a Tier-1 site/queue list """
        # Cloud : PanDA site, queue

        tier1 = self.setTier1Info()
        return tier1[cloud]

    def getTier1InfoFilename(self):
        """ Get the Tier-1 info file name """

        filename = "Tier-1_info.%s" % (getExtension())
        path = "%s/%s" % (os.environ['PilotHomeDir'], filename)

        return path

    def downloadTier1Info(self):
        """ Download the Tier-1 info file """

        ec = 0

        path = self.getTier1InfoFilename()
        filename = os.path.basename(path)
        dummy, extension = os.path.splitext(filename)

        # url = "http://adc-ssb.cern.ch/SITE_EXCLUSION/%s" % (filename)
        if extension == ".json":
            _cmd = "?json"
            # _cmd = "?json&preset=ssbpilot"
        else:
            _cmd = "?preset=ssbpilot"
        url = "http://atlas-agis-api.cern.ch/request/site/query/list/%s" % (_cmd)
        cmd = 'curl --connect-timeout 20 --max-time 120 -sS "%s" > %s' % (url, path)

        if os.path.exists(path):
            tolog("File %s already available" % (path))
        else:
            tolog("Will download file: %s" % (filename))

            try:
                tolog("Executing command: %s" % (cmd))
                ret, output = commands.getstatusoutput(cmd)
            except Exception, e:
                tolog("!!WARNING!!1992!! Could not download file: %s" % (e))
                ec = -1
            else:
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """
    The local file is assumed to have a relative path that is the same as the relative path in the 'gpfn'
    loc_... are the variables used to access the file in the locally exported file system
    """

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    useCT = pdict.get('usect', True)
    jobId = pdict.get('jobId', '')
    workDir = pdict.get('workDir', '')
    prodDBlockToken = pdict.get('access', '')

    # get the DQ2 tracing report
    try:
        report = pdict['report']
    except:
        report = {}
    else:
        # set the proper protocol
        report['protocol'] = 'rfcpLFC'
        # mark the relative start
        report['relativeStart'] = time()
        # the current file
        report['filename'] = lfn
        # guid
        report['guid'] = guid.replace('-', '')

    tolog("gpfn is %s" % gpfn)

    # get a proper envsetup
    envsetup = self.getEnvsetup(get=True)

    if self._setup:
        _setup_str = "source %s; " % self._setup
    else:
        _setup_str = envsetup

    ec, pilotErrorDiag = verifySetupCommand(error, _setup_str)
    if ec != 0:
        self.prepareReport('RFCP_FAIL', report)
        return ec, pilotErrorDiag

    # remove any host and SFN info from PFN path
    loc_pfn = self.extractPathFromPFN(gpfn)

    try:
        if not loc_pfn.startswith(('/dpm', '/castor')):
            tolog("Potential problem with local filename. Does not start with '/dpm' or '/castor/'.")
    except TypeError:
        # Older version of python
        pass

    # should the root file be copied or read directly by athena?
    directIn, useFileStager = self.getTransferModes()
    if directIn:
        if useCT:
            directIn = False
            tolog("Direct access mode is switched off (file will be transferred with the copy tool)")
            updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", type="input")
        else:
            # determine if the file is a root file according to its name
            rootFile = self.isRootFileName(lfn)

            if prodDBlockToken == 'local' or not rootFile:
                directIn = False
                tolog("Direct access mode has been switched off for this file (will be transferred with the copy tool)")
                updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", type="input")
            elif rootFile:
                tolog("Found root file according to file name: %s (will not be transferred in direct reading mode)" % (lfn))
                report['relativeStart'] = None
                report['transferStart'] = None
                self.prepareReport('FOUND_ROOT', report)
                if useFileStager:
                    updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="file_stager", type="input")
                else:
                    updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="remote_io", type="input")
                return error.ERR_DIRECTIOFILE, pilotErrorDiag
            else:
                tolog("Normal file transfer")

    dest_path = os.path.join(path, lfn)
    #PN
    _cmd_str = '%srfcp %s %s' % (_setup_str, loc_pfn, dest_path)
    # if ".lib." in loc_pfn:
    #     _cmd_str = '%srfcp %s %s' % (_setup_str, loc_pfn, dest_path)
    # else:
    #     _cmd_str = '%srfcpXXX %s %s' % (_setup_str, loc_pfn, dest_path)

    tolog("Executing command: %s" % (_cmd_str))
    report['transferStart'] = time()

    # execute
    timeout = 3600
    try:
        s, telapsed, cout, cerr = timed_command(_cmd_str, timeout)
    except Exception, e:
        pilotErrorDiag = 'timed_command() threw an exception: %s' % (e)
        tolog("!!WARNING!!1111!! %s" % (pilotErrorDiag))
        s = 1
        o = str(e)
        telapsed = timeout
def put_data(self, source, ddm_storage, fsize=0, fchecksum=0, dsname='', **pdict):
    """ Data transfer using rfcp - generic version
    It's not advisable to use this right now because there's no easy way
    to register the srm space token if the file is copied with rfcp """

    error = PilotErrors()
    pilotErrorDiag = ""

    tolog("put_data() got ddm_storage=%s" % (ddm_storage))

    # Get input parameters from pdict
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    dsname = pdict.get('dsname', '')
    analJob = pdict.get('analJob', False)
    extradirs = pdict.get('extradirs', '')

    if self._setup:
        _setup_str = "source %s; " % self._setup
    else:
        _setup_str = ''

    # get the DQ2 tracing report
    try:
        report = pdict['report']
    except:
        report = {}
    else:
        # set the proper protocol
        report['protocol'] = 'rfpLFC'
        # mark the relative start
        report['relativeStart'] = time()
        # the current file
        report['filename'] = lfn
        # guid
        report['guid'] = guid.replace('-', '')

    # At the destination, append a subdirectory which is the first two fields of dsname, or 'other'
    destination = readpar('sepath')
    if destination == '':
        pilotErrorDiag = "put_data destination path in SE not defined"
        tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
        self.prepareReport('DEST_PATH_UNDEF', report)
        return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)

    if dsname == '':
        pilotErrorDiag = "Dataset name not specified to put_data"
        tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
        self.prepareReport('DSN_UNDEF', report)
        return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
    # else:
    #     dsname = self.remove_sub(dsname)
    #     tolog("dsname: %s" % (dsname))

    # report['dataset'] = dsname

    pat = re.compile('([^\.]+\.[^\.]+)\..*')
    mat = pat.match(dsname)
    if mat:
        prefixdir = mat.group(1)
        destination = os.path.join(destination, prefixdir)
    else:
        pilotErrorDiag = "put_data encountered unexpected dataset name format: %s" % (dsname)
        tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
        self.prepareReport('DSN_FORMAT_FAIL', report)
        return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)

    # preparing variables
    src_pfn = source
    if fsize == 0 or fchecksum == 0:
        ec, pilotErrorDiag, fsize, fchecksum = self.getLocalFileInfo(src_pfn, csumtype="adler32")
        if ec != 0:
            self.prepareReport('LOCAL_FILE_INFO_FAIL', report)
            return SiteMover.SiteMover.put_data_retfail(ec, pilotErrorDiag)

    # now that the file size is known, add it to the tracing report
    report['filesize'] = fsize

    dst_se = destination
    if dst_se.find('SFN') != -1:  # srm://dcsrm.usatlas.bnl.gov:8443/srm/managerv1?SFN=/pnfs/usatlas.bnl.gov/
        s = dst_se.split('SFN=')
        dst_loc_se = s[1]
        dst_prefix = s[0] + 'SFN='
    else:
        _sentries = dst_se.split('/', 3)
        dst_serv = _sentries[0] + '//' + _sentries[2]  # 'method://host:port' is it always a ftp server? can it be srm? something else?
        dst_host = _sentries[2]  # host and port
        dst_loc_se = '/' + _sentries[3]
        dst_prefix = dst_serv

    filename = os.path.basename(source)

    # Behavior as in BNL: user files have no dsname automatically added to the dir name
    m = re.search('^user', filename)
    if m:
        dsname = ''
    dst_loc_sedir = os.path.join(dst_loc_se, os.path.join(extradirs, dsname))

    copyprefix = readpar('copyprefix')
    tolog('copyprefix: %s' % (copyprefix))
    if copyprefix != '':
        # Replace prefix on pfn
        pfrom, pto = copyprefix.split('^')
        tolog("Replacing %s with %s on %s" % (pfrom, pto, dst_loc_sedir))
        dst_loc_sedir = dst_loc_sedir.replace(pfrom, pto)

    dst_loc_pfn = os.path.join(dst_loc_sedir, filename)
    dst_gpfn = dst_prefix + dst_loc_pfn

    # get the DQ2 site name from ToA
    try:
        _dq2SiteName = self.getDQ2SiteName(surl=dst_gpfn)
    except Exception, e:
        tolog("Warning: Failed to get the DQ2 site name: %s (can not add this info to tracing report)" % str(e))
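# Illustrative example of the copyprefix replacement above (values hypothetical):
# copyprefix is a '^'-separated from/to pair read from schedconfig, so with
#   copyprefix = "/pnfs/example.org^root://door.example.org//pnfs/example.org"
# a destination directory such as
#   /pnfs/example.org/data/dsname
# is rewritten to
#   root://door.example.org//pnfs/example.org/data/dsname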
    # JEM job-end callback
    try:
        from JEMstub import notifyJobEnd2JEM
        notifyJobEnd2JEM(job, tolog)
    except:
        pass  # don't care (fire and forget)

    return res, job, getstatusoutput_was_interrupted, current_job_number

if __name__ == "__main__":

    tolog("Starting RunJobHopper")

    # Get error handler
    error = PilotErrors()

    # Get runJob object
    runJob = RunJobHopper()

    # Setup HPC specific parameters for Edison
    runJob.cpu_number_per_node = 24
    runJob.walltime = 120
    runJob.max_nodes = 10
    runJob.number_of_threads = 1
    runJob.min_walltime = 10  # minutes
    runJob.waittime = 15  # minutes
    runJob.nodes = 2
    runJob.partition_comp = 'hopper'
    runJob.project_id = ""
class LocalSiteMover(SiteMover.SiteMover):
    """ SiteMover that uses lsm for both get and put """
    # no registration is done

    copyCommand = "lsm"
    realCopyCommand = "lsm-get"
    checksum_command = "adler32"
    timeout = 3600

    __warningStr = '!!WARNING!!2995!! %s'
    __spacetoken = '-t %s'  # space token descriptor
    __localget = '%s lsm-get %s %s %s'  # environment, options, lfn, target directory
    __localput = '%s lsm-put %s %s %s'  # environment, space token (optional), source directory, destination
    __localputBAD = '%s lsm-put %s %s %s'  # environment, space token (optional), source directory, destination
    __localspace = '%s lsm-df %s %s'  # environment, space token (optional), storage end-point
    __par_filesize = ' --size %s'  # filesize in bytes
    __par_checksum = ' --checksum %s'  # checksum string: "adler32:NNN", "md5:NNN", default is assumed MD5
    __timeout = 5400  # seconds
    __error = PilotErrors()
    __pilotErrorDiag = ''

    def __init__(self, setup_path, *args, **kwrds):
        self._setup = setup_path.strip()
        self.__isSetuped = False
        self._defaultSetup = None

    def get_timeout(self):
        return self.timeout

    def log(self, errorLog):
        tolog(errorLog)

    def getSetup(self):
        """ Return the setup string (pacman setup or setup script) for the copy command used by the mover """

        _setup_str = ""
        self._setup = self._setup.strip()
        tolog("self setup: %s" % self._setup)

        if self._setup and self._setup != "" and self._setup.strip() != "":
            if not self._setup.endswith(";"):
                self._setup += ";"
            if not "alias" in self._setup:
                if "atlasLocalSetup.sh" in self._setup and "--quiet" not in self._setup:
                    self._setup = self._setup.replace("atlasLocalSetup.sh", "atlasLocalSetup.sh --quiet")
                if self._setup.startswith("export") or self._setup.startswith("source"):
                    _setup_str = "%s" % self._setup
                else:
                    _setup_str = "source %s" % self._setup
            else:
                _setup_str = self._setup

        if _setup_str != "":
            tolog("Using setup: %s" % (_setup_str))

        return _setup_str

    def verifySetupCommand(self, _setupStr):
        """ Make sure the setup command exists """

        statusRet = 0
        outputRet = {}
        outputRet["errorLog"] = None
        outputRet["report"] = {}
        outputRet["report"]["clientState"] = None

        # remove any '-signs
        _setupStr = _setupStr.replace("'", "")
        self.log("Will verify: %s" % (_setupStr))

        if _setupStr != "" and "source " in _setupStr:
            # first extract the file paths from the source command(s)
            setupPaths = extractFilePaths(_setupStr)

            # only run the test if the string begins with an "/"
            if setupPaths:
                # verify that the file paths actually exist
                for setupPath in setupPaths:
                    if "-" in setupPath:
                        continue
                    if os.path.exists(setupPath):
                        self.log("File %s has been verified" % (setupPath))
                    else:
                        outputRet["errorLog"] = errorLog = "No such file or directory: %s" % (setupPath)
                        self.log('!!WARNING!!2991!! %s' % (errorLog))
                        statusRet = PilotErrors.ERR_NOSUCHFILE
                        break
            else:
                # nothing left to test
                pass
        else:
            self.log("Nothing to verify in setup: %s (either empty string or no source command)" % (_setupStr))

        return statusRet, outputRet

    def verifySetupProxy(self, _setupStr, experiment):
        # check whether we have a valid proxy

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        status, output = thisExperiment.verifyProxy(envsetup=_setupStr)
        return status, output

    def verifySetup(self, _setupStr, experiment, proxycheck=True):
        statusRet, outputRet = self.verifySetupCommand(_setupStr)
        if statusRet != 0:
            # self.prepareReport('RFCP_FAIL', self._variables['report'])
            outputRet["report"]["clientState"] = "RFCP_FAIL"
            return statusRet, outputRet

        command = _setupStr
        if command != "" and not command.endswith(';'):
            command = command + ";"
        command += " which " + self.realCopyCommand
        status, output = commands.getstatusoutput(command)
        self.log("Execute command: %s" % command)
        self.log("Status: %s, Output: %s" % (status, output))

        if status != 0:
            self.log(self.copyCommand + " is not found in envsetup: " + _setupStr)
            # self.prepareReport('RFCP_FAIL', self._variables['report'])
            outputRet["report"]["clientState"] = "RFCP_FAIL"
            outputRet["errorLog"] = output
            return status, outputRet

        if proxycheck:
            status, outputLog = self.verifySetupProxy(_setupStr, experiment)
            if status != 0:
                outputRet["errorLog"] = outputLog
                outputRet["report"]["clientState"] = 'PROXYFAIL'
                return status, outputRet

        return status, outputRet

    def setup(self, experiment):
        """ setup env """

        if self.__isSetuped:
            return 0, None

        thisExperiment = getExperiment(experiment)
        self.useTracingService = thisExperiment.useTracingService()

        _setupStr = self.getSetup()

        # get the user proxy if available
        envsetupTest = _setupStr.strip()
        if envsetupTest != "" and not envsetupTest.endswith(';'):
            envsetupTest += ";"
        if os.environ.has_key('X509_USER_PROXY'):
            envsetupTest += " export X509_USER_PROXY=%s;" % (os.environ['X509_USER_PROXY'])

        self.log("to verify site setup: %s " % envsetupTest)
        status, output = self.verifySetup(envsetupTest, experiment)
        self.log("site setup verifying: status: %s, output: %s" % (status, output["errorLog"]))
        if status == 0:
            self._setup = envsetupTest
            self.__isSetuped = True
            return status, output
        else:
            if self._defaultSetup:
                # try to use the default setup
                self.log("Try to use default envsetup")
                envsetupTest = self._defaultSetup.strip()
                if envsetupTest != "" and not envsetupTest.endswith(';'):
                    envsetupTest += ";"
                if os.environ.has_key('X509_USER_PROXY'):
                    envsetupTest += " export X509_USER_PROXY=%s;" % (os.environ['X509_USER_PROXY'])

                self.log("verify default setup: %s " % envsetupTest)
                status, output = self.verifySetup(envsetupTest, experiment)
                self.log("default setup verifying: status: %s, output: %s" % (status, output["errorLog"]))
                if status == 0:
                    self._setup = envsetupTest
                    self.__isSetuped = True
                    return status, output

        return status, output

    def fixStageInPath(self, path):
        """ Fix the path """

        if path[:3] == "srm" and '?SFN=' in path:
            self.log("Found SFN part in file path: %s" % (path))
        elif path[:3] == "srm":
            try:
                hostname = path.split('/', 3)[2]
            except Exception as e:
                self.log("!!WARNING!!2999!! Could not extract srm protocol for replacement, keeping path variable as it is: %s (%s)" % (path, str(e)))
            else:
                # srm = 'srm://head01.aglt2.org'
                srm = 'srm://' + hostname

                # does seopt contain any matching srm's?
                sematch = self.getSEMatchFromSEOpt(srm)
                if sematch != "":
                    path = path.replace(srm, sematch)
                    self.log("Replaced %s with %s (from seopt) in path: %s" % (srm, sematch, path))
                else:
                    se = readpar('se').split(",")[0]
                    _dummytoken, se = self.extractSE(se)
                    tolog("Using SE: %s" % (se))

                    path = path.replace(srm, se)
                    self.log("Replaced %s with %s (from se) in path: %s" % (srm, se, path))

                    # add port number from se to getfile if necessary
                    path = self.addPortToPath(se, path)

        return path

    def getStageInMode(self, lfn, prodDBlockToken, transferType):
        # should the root file be copied or read directly by athena?

        status = 0
        output = {}
        output["errorLog"] = None
        output["report"] = {}
        output["report"]["clientState"] = None
        output["transfer_mode"] = None

        isRootFileName = self.isRootFileName(lfn)

        siteInformation = SiteInformation()
        directIn, transfer_mode = siteInformation.getDirectInAccessMode(prodDBlockToken, isRootFileName, transferType)
        if transfer_mode:
            output["transfer_mode"] = transfer_mode
        if directIn:
            output["report"]["clientState"] = 'FOUND_ROOT'
            output["report"]['relativeStart'] = None
            output["report"]['transferStart'] = None
            return PilotErrors.ERR_DIRECTIOFILE, output

        return 0, output

    def stageInFile(self, source, destination, sourceSize, sourceChecksum, guid=None):
        """ Stage in the file. Should be implemented by the different site movers. """

        statusRet = 0
        outputRet = {}
        outputRet["errorLog"] = None
        outputRet["report"] = {}
        outputRet["report"]["clientState"] = None

        # build the parameters
        _params = ""
        if sourceSize != 0 and sourceSize != "0":
            _params += self.__par_filesize % (sourceSize)
        if sourceChecksum and sourceChecksum != 'None' and sourceChecksum != 0 and sourceChecksum != "0" and not self.isDummyChecksum(sourceChecksum):
            csumtype = self.getChecksumType(sourceChecksum)
            # special case for md5sum (command only understands 'md5' and 'adler32', and not 'ad' and 'md5sum')
            if csumtype == 'md5sum':
                csumtype = 'md5'
            _params += self.__par_checksum % ("%s:%s" % (csumtype, sourceChecksum),)

        # add the guid option
        _params += " --guid %s" % (guid)

        self.log("StageIn files started.")
        _cmd_str = self.__localget % (self._setup, _params, source, destination)
        self.log('Executing command: %s' % (_cmd_str))

        s = -1
        o = '(not defined)'
        t0 = os.times()
        outputRet["report"]['relativeStart'] = time()
        outputRet["report"]['transferStart'] = time()
        try:
            timerCommand = TimerCommand(_cmd_str)
            s, o = timerCommand.run(timeout=self.timeout)
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught by stageInFile(): %s" % (str(e)))
            o = str(e)
        t1 = os.times()
        t = t1[4] - t0[4]
        self.log("Command finished after %f s: %s" % (t, o.replace('\n', ' ')))

        if s == 0:
            self.log("Stagein succeeded")
        else:
            self.log("!!WARNING!!2990!! Command failed: %s" % (_cmd_str))
            o = o.replace('\n', ' ')
            # check_syserr(s, o)
            self.log("!!WARNING!!2990!! get_data failed. Status=%s Output=%s" % (s, str(o)))

            # remove the local file before any get retry is attempted
            _status = self.removeLocal(destination)
            if not _status:
                self.log("!!WARNING!!1112!! Failed to remove local file, get retry will fail")

            status, output = self.errorToReport(o, t, source, stageMethod="stageIN")
            return status, output

        # outputRet["report"]["clientState"] = "DONE"
        return statusRet, outputRet
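# Illustrative command expansion (paths and values hypothetical): with the
# __localget template '%s lsm-get %s %s %s' (environment, options, source,
# destination), stageInFile() above ends up executing something like
#
#   source /setup.sh; lsm-get --size 1048576 --checksum adler32:0a1b2c3d --guid 1234-abcd /se/path/file.root /scratch/file.root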
class OtherExperiment(Experiment):

    # private data members
    __experiment = "Other"
    __instance = None
    __error = PilotErrors()  # PilotErrors object
    __doFileLookups = False  # True for LFC based file lookups (basically a dummy data member here since singleton object is static)
    __cache = ""  # Cache URL used e.g. by LSST

    # Required methods

    def __init__(self):
        """ Default initialization """

        # e.g. self.__errorLabel = errorLabel
        pass

    def __new__(cls, *args, **kwargs):
        """ Override the __new__ method to make the class a singleton """

        if not cls.__instance:
            cls.__instance = super(OtherExperiment, cls).__new__(cls, *args, **kwargs)

        return cls.__instance

    def getExperiment(self):
        """ Return a string with the experiment name """

        return self.__experiment

    def setParameters(self, *args, **kwargs):
        """ Set any internally needed variables """

        # set initial values
        self.__job = kwargs.get('job', None)
        if self.__job:
            self.__analysisJob = isAnalysisJob(self.__job.trf)

    def getJobExecutionCommand(self):
        """ Define and test the command(s) that will be used to execute the payload """
        # E.g. cmd = "source <path>/setup.sh; <path>/python "

        cmd = ""

        return cmd

    def willDoFileLookups(self):
        """ Should (LFC) file lookups be done by the pilot or not? """

        return False

    def willDoFileRegistration(self):
        """ Should (LFC) file registration be done by the pilot or not? """

        return False

    def doFileLookups(self, doFileLookups):
        """ Update the file lookups boolean """

        # Only implement this method if the class really wants to update the __doFileLookups boolean
        # ATLAS wants to implement this, but not CMS
        # Method is used by Mover
        # self.__doFileLookups = doFileLookups
        pass

    def isOutOfMemory(self, **kwargs):
        """ Try to identify out of memory errors in the stderr/out """

        return False

    def getNumberOfEvents(self, **kwargs):
        """ Return the number of events """

        return 0

    def specialChecks(self, **kwargs):
        """ Implement special checks here """
        # Return False if fatal failure, otherwise return True
        # The pilot will abort if this method returns a False

        status = False

        tolog("No special checks for \'%s\'" % (self.__experiment))

        return True  # obviously change this to 'status' once implemented

    # Optional
    def setCache(self, cache):
        """ Cache URL """
        # Used e.g. by LSST

        self.__cache = cache

    # Optional
    def getCache(self):
        """ Return the cache URL """
        # Used e.g. by LSST

        return self.__cache

    # Optional
    def useTracingService(self):
        """ Use the DQ2 Tracing Service """
        # A service provided by the DQ2 system that allows for file transfer tracking; all file transfers
        # are reported by the pilot to the DQ2 Tracing Service if this method returns True

        return False
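# Usage note (illustrative, not part of the original source): since __new__ is
# overridden above, repeated instantiation returns the same object:
#
#   exp1 = OtherExperiment()
#   exp2 = OtherExperiment()
#   assert exp1 is exp2                        # singleton
#   assert exp1.getExperiment() == "Other"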
class ChirpSiteMover(SiteMover.SiteMover):
    """ SiteMover for CHIRP copy commands etc """

    copyCommand = "chirp"
    checksum_command = "adler32"
    __warningStr = '!!WARNING!!2995!! %s'
    __chirp = 'chirp -t 300 %s %s < %s'  # options, server, command file
    __timeout = 300  # seconds
    __error = PilotErrors()
    __pilotErrorDiag = ''
    __MAX_FILE_SIZE = 200*1024**2

    def get_timeout(self):
        return self.__timeout

    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ copy input file from SE to local dir """

        # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
        useCT = pdict.get('usect', True)
        prodDBlockToken = pdict.get('access', '')

        # get the Rucio tracing report
        try:
            report = pdict['report']
        except:
            report = {}
        else:
            # set the proper protocol
            report['protocol'] = 'local'
            # mark the relative start
            report['relativeStart'] = time()
            # the current file
            report['filename'] = lfn
            # guid
            report['guid'] = guid.replace('-', '')

        if not path:
            tolog('path is empty, using current directory')
            path = os.getcwd()

        # build setup string
        envsetup = self.getEnvsetup(get=True)

        # should the root file be copied or read directly by athena?
        directIn = False
        dInfo = getDirectAccessDic(readpar('copysetupin'))
        # if copysetupin did not contain direct access info, try the copysetup instead
        if not dInfo:
            dInfo = getDirectAccessDic(readpar('copysetup'))
        tolog("dInfo: %s" % str(dInfo))

        # check if we should use the copytool
        if dInfo:
            directIn = dInfo['directIn']

        if directIn:
            if useCT:
                directIn = False
                tolog("Direct access mode is switched off (file will be transferred with the copy tool)")
            else:
                # determine if the file is a root file according to its name
                rootFile = self.isRootFileName(lfn)

                if prodDBlockToken == 'local' or not rootFile:
                    directIn = False
                    tolog("Direct access mode has been switched off for this file (will be transferred with the copy tool)")
                elif rootFile:
                    tolog("Found root file according to file name: %s (will not be transferred in direct reading mode)" % (lfn))
                    report['relativeStart'] = None
                    report['transferStart'] = None
                    self.prepareReport('FOUND_ROOT', report)
                    return 0, self.__pilotErrorDiag
                else:
                    tolog("Normal file transfer")
        else:
            tolog("not directIn")

        # build the get command
        _params = ""
        if fchecksum and fchecksum != 'None' and fchecksum != 0 and fchecksum != "0" and not self.isDummyChecksum(fchecksum):
            csumtype = self.getChecksumType(fchecksum)
            # special case for md5sum (command only understands 'md5' and 'adler32', and not 'ad' and 'md5sum')
            if csumtype == 'md5sum':
                csumtype = 'md5'

        execStr = self.__localget % (envsetup, _params, gpfn, os.path.join(path, lfn))
        tolog("Executing command: %s" % (execStr))

        report['transferStart'] = time()
        try:
            status, telapsed, cout, cerr = timed_command(execStr, self.__timeout)
        except Exception, e:
            self.__pilotErrorDiag = 'timed_command() threw an exception: %s' % str(e)
            tolog(self.__warningStr % self.__pilotErrorDiag)
            status = 1
            output = str(e)
            telapsed = self.__timeout
        else:
def getNodeStructure(self, job, site, workerNode, spaceReport=False, log=None):
    """ define the node structure expected by the server """

    node = {}

    node['node'] = workerNode.nodename
    node['workdir'] = job.workdir
    node['siteName'] = site.sitename
    node['jobId'] = job.jobId
    node['state'] = job.result[0]
    node['timestamp'] = timeStamp()
    if job.attemptNr > -1:
        node['attemptNr'] = job.attemptNr
    if self.__jobSchedulerId:
        node['schedulerID'] = self.__jobSchedulerId
    if self.__pilotId:
        # report the batch system job id, if available
        batchSystemType, _id = getBatchSystemJobID()
        if batchSystemType:
            tolog("Batch system: %s" % (batchSystemType))
            tolog("Batch system job ID: %s" % (_id))
            node['pilotID'] = "%s|%s|%s|%s|%s" % (self.__pilotId, _id, batchSystemType, self.__pilot_version_tag, self.__pilot_version)
            node['batchID'] = _id
            tolog("Will send batchID: %s and pilotID: %s" % (node['batchID'], node['pilotID']))
        else:
            tolog("Batch system type was not identified (will not be reported)")
            node['pilotID'] = "%s|%s|%s" % (self.__pilotId, self.__pilot_version_tag, self.__pilot_version)
            tolog("Will send pilotID: %s" % (node['pilotID']))
        tolog("pilotId: %s" % str(self.__pilotId))
    if log and (job.result[0] == 'failed' or job.result[0] == 'holding' or "outbound connections" in log):
        node['pilotLog'] = log

    # build the jobMetrics
    node['jobMetrics'] = self.getJobMetrics(job, workerNode)

    # send pilotErrorDiag for finished, failed and holding jobs
    if job.result[0] == 'finished' or job.result[0] == 'failed' or job.result[0] == 'holding':
        # get the pilot error diag
        if job.pilotErrorDiag:
            if job.pilotErrorDiag == "":
                node['pilotErrorDiag'] = tailPilotErrorDiag(self.__error.getPilotErrorDiag(job.result[2]))
                job.pilotErrorDiag = node['pilotErrorDiag']
                tolog("Empty pilotErrorDiag set to: %s" % (job.pilotErrorDiag))
            elif job.pilotErrorDiag.upper().find("<HTML>") >= 0:
                tolog("Found html in pilotErrorDiag: %s" % (job.pilotErrorDiag))
                node['pilotErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2])
                job.pilotErrorDiag = node['pilotErrorDiag']
                tolog("Updated pilotErrorDiag: %s" % (job.pilotErrorDiag))
            else:
                # truncate if necessary
                if len(job.pilotErrorDiag) > 250:
                    tolog("pilotErrorDiag will be truncated to size 250")
                    tolog("Original pilotErrorDiag message: %s" % (job.pilotErrorDiag))
                    job.pilotErrorDiag = job.pilotErrorDiag[:250]
                # set the pilotErrorDiag, but only the last 256 characters
                node['pilotErrorDiag'] = tailPilotErrorDiag(job.pilotErrorDiag)
        else:
            # set the pilotErrorDiag, but only the last 256 characters
            job.pilotErrorDiag = self.__error.getPilotErrorDiag(job.result[2])
            node['pilotErrorDiag'] = tailPilotErrorDiag(job.pilotErrorDiag)
            tolog("Updated pilotErrorDiag from None: %s" % (job.pilotErrorDiag))

    # get the number of events
    if job.nEvents != 0:
        node['nEvents'] = job.nEvents
        tolog("Total number of processed events: %d (read)" % (job.nEvents))
    else:
        tolog("runJob did not report on the total number of read events")

    if job.result[0] == 'finished' or job.result[0] == 'failed':
        # make sure there is no mismatch between the transformation error codes (when both are reported)
        # send transformation errors depending on what is available
        if job.exeErrorDiag != "":
            node['exeErrorCode'] = job.exeErrorCode
            node['exeErrorDiag'] = job.exeErrorDiag
        else:
            node['transExitCode'] = job.result[1]
        if (job.result[0] == 'failed') and (job.exeErrorCode != 0) and (job.result[1] != job.exeErrorCode):
            if log:
                mismatch = "MISMATCH | Trf error code mismatch: exeErrorCode = %d, transExitCode = %d" %\
                           (job.exeErrorCode, job.result[1])
                if node.has_key('pilotLog'):
                    node['pilotLog'] = mismatch + node['pilotLog']
                else:
                    tolog("!!WARNING!!1300!! Could not write mismatch error to log extracts: %s" % mismatch)

        # check if Pilot-controlled resubmission is required:
        if job.result[0] == "failed" and 'ANALY' in site.sitename:
            pilotExitCode = job.result[2]
            error = PilotErrors()
            if error.isPilotResubmissionErrorCode(pilotExitCode) or job.isPilotResubmissionRequired:
                # negate the PilotError, ensure it is negative
                job.result[2] = -abs(pilotExitCode)
                tolog("(Negated error code)")
            else:
                tolog("(No need to negate error code)")

        node['pilotErrorCode'] = job.result[2]
        tolog("Pilot error code: %d" % (node['pilotErrorCode']))

        # report CPUTime and CPUunit at the end of the job
        node['cpuConsumptionTime'] = job.cpuConsumptionTime
        try:
            node['cpuConsumptionUnit'] = job.cpuConsumptionUnit + "+" + getCPUmodel()
        except:
            node['cpuConsumptionUnit'] = '?'
        node['cpuConversionFactor'] = job.cpuConversionFactor

        # report specific time measures
        # node['pilotTiming'] = "getJob=%s setup=%s stageIn=%s payload=%s stageOut=%s" % (job.timeGetJob, job.timeSetup, job.timeStageIn, job.timeExe, job.timeStageOut)
        node['pilotTiming'] = "%s|%s|%s|%s|%s" % (job.timeGetJob, job.timeStageIn, job.timeExe, job.timeStageOut, job.timeSetup)
        # node['pilotTiming'] = "%s|%s|%s|%s|%s" % (str(job.timeGetJob), str(job.timeStageIn), str(job.timeExe), str(job.timeStageOut), str(job.timeSetup))
    elif job.result[0] == 'holding':
        node['exeErrorCode'] = job.result[2]
        node['exeErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2])
    else:
        node['cpuConsumptionUnit'] = getCPUmodel()

    if spaceReport and site.dq2space != -1:  # non-empty string and the space check function runs well
        node['remainingSpace'] = site.dq2space
        node['messageLevel'] = site.dq2spmsg

    return node