def stageInFile(self, source, destination): """StageIn the file. should be implementated by different site mover.""" statusRet = 0 outputRet = {} outputRet["errorLog"] = None outputRet["report"] = {} outputRet["report"]["clientState"] = None self.log("StageIn files started.") _cmd_str = '%s xrdcp -np %s %s' % (self._setup, source, destination) # update job setup script thisExperiment = getExperiment(self.__experiment) # add the full stage-out command to the job setup script to_script = _cmd_str.replace(destination, "`pwd`/%s" % os.path.basename(destination)) to_script = to_script.lstrip(' ') # remove any initial spaces if to_script.startswith('/'): to_script = 'source ' + to_script thisExperiment.updateJobSetupScript(os.path.dirname(destination), to_script=to_script) self.log('Executing command: %s' % (_cmd_str)) s = -1 o = '(not defined)' t0 = os.times() outputRet["report"]['relativeStart'] = time() outputRet["report"]['transferStart'] = time() try: timerCommand = TimerCommand(_cmd_str) s, o = timerCommand.run(timeout=self.timeout) except Exception, e: tolog("!!WARNING!!2990!! Exception caught by stageInFile(): %s" % (str(e))) o = str(e)
def stageInFile(self, source, destination):
    """StageIn the file. should be implementated by different site mover.

    Builds an xrdcp command, records it in the job setup script, then runs it
    under self.timeout via TimerCommand.  Status/output end up in s/o.
    """
    statusRet = 0
    outputRet = {}
    outputRet["errorLog"] = None
    outputRet["report"] = {}
    outputRet["report"]["clientState"] = None

    self.log("StageIn files started.")
    # xrdcp with -np (no progress bar); setup string is prepended
    _cmd_str = '%s xrdcp -np %s %s' % (self._setup, source, destination)

    # update job setup script
    thisExperiment = getExperiment(self.__experiment)

    # add the full stage-out command to the job setup script
    # (destination rewritten so the recorded command is workdir-relative)
    to_script = _cmd_str.replace(destination, "`pwd`/%s" % os.path.basename(destination))
    to_script = to_script.lstrip(' ')  # remove any initial spaces
    if to_script.startswith('/'):
        to_script = 'source ' + to_script
    thisExperiment.updateJobSetupScript(os.path.dirname(destination), to_script=to_script)

    self.log('Executing command: %s' % (_cmd_str))
    s = -1
    o = '(not defined)'
    t0 = os.times()
    outputRet["report"]['relativeStart'] = time()
    outputRet["report"]['transferStart'] = time()
    try:
        timerCommand = TimerCommand(_cmd_str)
        s, o = timerCommand.run(timeout=self.timeout)
    except Exception, e:
        tolog("!!WARNING!!2990!! Exception caught by stageInFile(): %s" % (str(e)))
        o = str(e)
def setupHPCEvent(self):
    """Initialise site info, work directory, node description and the
    experiment object for an HPC event-service job."""
    site = Site.Site()
    site.setSiteInfo(self.argumentParser())
    ## For HPC job, we don't need to reassign the workdir
    # reassign workdir for this job
    site.workdir = site.wntmpdir
    self.__jobSite = site

    if not os.path.exists(site.workdir):
        os.makedirs(site.workdir)

    tolog("runJobHPCEvent.getPilotLogFilename=%s" % self.getPilotLogFilename())
    if self.getPilotLogFilename() != "":
        pUtil.setPilotlogFilename(self.getPilotLogFilename())

    # set node info
    wn = Node.Node()
    wn.setNodeName(os.uname()[1])
    wn.collectWNInfo(site.workdir)
    self.__node = wn

    # redirect stderr
    #sys.stderr = open("%s/runJobHPCEvent.stderr" % (self.__jobSite.workdir), "w")

    tolog("Current job workdir is: %s" % os.getcwd())
    tolog("Site workdir is: %s" % site.workdir)

    # get the experiment object
    self.__thisExperiment = getExperiment(self.getExperiment())
    tolog("runEvent will serve experiment: %s" % (self.__thisExperiment.getExperiment()))
def extractJobInformation(self, job, runCommandList): """ Extract relevant job information, e.g. number of events """ # get the experiment object thisExperiment = getExperiment(job.experiment) if not thisExperiment: job.pilotErrorDiag = "ErrorDiagnosis did not get an experiment object from the factory" job.result[ 2] = self.__error.ERR_GENERALERROR # change to better/new error code tolog("!!WARNING!!3234!! %s" % (job.pilotErrorDiag)) return job # note that this class should not be experiment specific, so move anything related to ATLAS to ATLASExperiment.py # and use thisExperiment.whatever() to retrieve it here # grab the number of events try: # nEvents_str can be a string of the form N|N|..|N with the number of jobs in the trf(s) [currently not used] # Add to Job class if necessary job.nEvents, job.nEventsW, nEvents_str = thisExperiment.getNumberOfEvents( job=job, number_of_jobs=len(runCommandList)) except Exception, e: tolog( "!!WARNING!!2999!! Failed to get number of events: %s (ignore)" % str(e))
def core_get_data(self, envsetup, token, source_surl, dest_path, experiment):
    """ stage-in core function, can be overridden (see stormSiteMover)

    Assembles and runs an lcg-cp command copying *source_surl* to the local
    file *dest_path*; the exact command depends on the lcg-utils version and
    on whether a space token was supplied.
    """

    error = PilotErrors()

    # determine which timeout option to use
    if self.isNewLCGVersion("%s lcg-cp" % (envsetup)):
        timeout_option = "--srm-timeout=%d --connect-timeout=300 --sendreceive-timeout=%d" % (self.timeout, self.timeout)
    else:
        timeout_option = "-t %d" % (self.timeout)

    # used lcg-cp options:
    # --vo: specifies the Virtual Organization the user belongs to
    # -t: time-out
    if token:
        # do not use option -b on SL3 clusters running older versions of lcg_utils
        use_b = True
        s, o = commands.getstatusoutput("%s lcg-cr --version" % (envsetup))
        if s != 0:
            # (BDII collects all information coming from site GIISes and stores them in a permanent database)
            tolog("(Probably too old lcg_utils - skipping BDII disabling)")
            use_b = False

        # for the time being
        # NOTE(review): this unconditionally overrides the probe above, so the
        # -b/space-token branch below is currently dead code
        use_b = False
        if use_b:
            _cmd_str = '%s lcg-cp --vo atlas --srcsetype srmv2 -s %s -b %s %s file://%s' %\
                       (envsetup, token, timeout_option, source_surl, dest_path)
        else:
            tolog("(Skipping space token for the time being)")
            _cmd_str = '%s lcg-cp --vo atlas %s %s file://%s' % (envsetup, timeout_option, source_surl, dest_path)
    else:
        _cmd_str = '%s lcg-cp --vo atlas %s %s file://%s' % (envsetup, timeout_option, source_surl, dest_path)

    # get the experiment object
    thisExperiment = getExperiment(experiment)

    # add the full stage-out command to the job setup script
    to_script = _cmd_str.replace("file://%s" % os.path.dirname(dest_path), "file://`pwd`")
    to_script = to_script.lstrip(' ')  # remove any initial spaces
    if to_script.startswith('/'):
        to_script = 'source ' + to_script
    thisExperiment.updateJobSetupScript(os.path.dirname(dest_path), to_script=to_script)

    tolog("Executing command: %s" % (_cmd_str))
    s = -1
    o = '(not defined)'
    t0 = os.times()
    try:
        s, o = commands.getstatusoutput(_cmd_str)
    except Exception, e:
        tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" % (str(e), s, o))
        o = str(e)
def setupHPCEvent(self):
    """Prepare site info, workdir, pilot log, node info and the experiment
    object for an HPC event-service job."""
    self.__jobSite = Site.Site()
    self.__jobSite.setSiteInfo(self.argumentParser())
    ## For HPC job, we don't need to reassign the workdir
    # reassign workdir for this job
    self.__jobSite.workdir = self.__jobSite.wntmpdir

    if not os.path.exists(self.__jobSite.workdir):
        os.makedirs(self.__jobSite.workdir)

    tolog("runJobHPCEvent.getPilotLogFilename=%s"% self.getPilotLogFilename())
    if self.getPilotLogFilename() != "":
        pUtil.setPilotlogFilename(self.getPilotLogFilename())

    # set node info
    self.__node = Node.Node()
    self.__node.setNodeName(os.uname()[1])
    self.__node.collectWNInfo(self.__jobSite.workdir)

    # redirect stderr
    #sys.stderr = open("%s/runJobHPCEvent.stderr" % (self.__jobSite.workdir), "w")

    tolog("Current job workdir is: %s" % os.getcwd())
    tolog("Site workdir is: %s" % self.__jobSite.workdir)

    # get the experiment object
    self.__thisExperiment = getExperiment(self.getExperiment())
    tolog("runEvent will serve experiment: %s" % (self.__thisExperiment.getExperiment()))
def verifySetupProxy(self, _setupStr, experiment):
    """Check whether a valid grid proxy is available.

    Delegates to the experiment object's verifyProxy(); returns its
    (status, output) pair unchanged.
    """
    exp = getExperiment(experiment)
    status, output = exp.verifyProxy(envsetup=_setupStr)
    return status, output
def core_get_data(self, envsetup, token, source_surl, local_fullname, experiment): """ special get function developed for storm sites """ error = PilotErrors() # Transform the surl into a full surl full_se_endpoint = self.extractSE(readpar('se').split(",")[0])[1] prefix = os.path.commonprefix([source_surl, full_se_endpoint]) if prefix: # Can use the bdii-free form source_surl = full_se_endpoint + source_surl[len(prefix):] _cmd_str = '%s lcg-gt --nobdii --setype srmv2 "%s" file' % ( envsetup, source_surl) else: # Fallback solution, use old lcg-gt form # get the TURL using the SURL tolog( "!!WARNING!1234!! Source surl does not match %s, cannot use the bdii-independent lcg-gt" % full_se_endpoint) _cmd_str = '%s lcg-gt "%s" file' % (envsetup, source_surl) tolog("Executing command: %s" % (_cmd_str)) t0 = os.times() s, o = commands.getstatusoutput(_cmd_str) t1 = os.times() t = t1[4] - t0[4] tolog("Command finished after %f s" % (t)) if s == 0: # get the experiment object thisExperiment = getExperiment(experiment) # add the full stage-out command to the job setup script to_script = _cmd_str to_script = to_script.lstrip(' ') # remove any initial spaces if to_script.startswith('/'): to_script = 'source ' + to_script thisExperiment.updateJobSetupScript( os.path.dirname(local_fullname), to_script=to_script) source_turl, req_token = o.split('\n') source_turl = source_turl.replace('file://', '') tolog("Creating link from %s to %s" % (source_turl, local_fullname)) try: os.symlink(source_turl, local_fullname) _cmd_str = '%s lcg-sd %s %s 0' % (envsetup, source_surl, req_token) tolog("Executing command: %s" % (_cmd_str)) s, o = commands.getstatusoutput(_cmd_str) # Do we need to check the exit status of lcg-sd? What do we do if it fails? tolog("get_data succeeded") except Exception, e: pilotErrorDiag = "Exception caught: %s" % str(e) tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag)) tolog("get_data failed") return error.ERR_STAGEINFAILED, pilotErrorDiag
def getOutFilesGuids(outFiles, workdir, experiment, TURL=False):
    """ get the outFilesGuids from the PFC """

    ec = 0
    pilotErrorDiag = ""

    # The experiment object knows which catalog file (if any) carries the GUIDs
    thisExperiment = getExperiment(experiment)
    filename = thisExperiment.getGUIDSourceFilename()

    # No source file configured (empty name) -> generate the GUIDs here
    if filename == "":
        tolog("Pilot will generate GUIDs for the output files")
        outFilesGuids = []
        for _ in outFiles:
            guid = getGUID()
            if guid == "":
                guid = "- GUID generation failed -"
            outFilesGuids.append(guid)
        return ec, pilotErrorDiag, outFilesGuids

    tolog("Pilot will get GUIDs for the output files from source: %s" % (filename))
    pfcFile = os.path.join(workdir, filename)  #"%s/PoolFileCatalog.xml" % (workdir)

    # The PFC used for Event Service will be TURL based, use the corresponding file
    if TURL:
        pfcFile = pfcFile.replace(".xml", "TURL.xml")

    # One slot per output file; files not found in the PFC keep None
    outFilesGuids = [None] * len(outFiles)

    # make sure the PFC exists
    if os.path.isfile(pfcFile):
        from xml.dom import minidom
        xmldoc = minidom.parse(pfcFile)
        for entry in xmldoc.getElementsByTagName("File"):
            gpfn = str(entry.getElementsByTagName("pfn")[0].getAttribute("name"))
            guid = str(entry.getAttribute("ID"))
            for index, fname in enumerate(outFiles):
                if fname == gpfn:
                    outFilesGuids[index] = guid
    else:
        pilotErrorDiag = "PFC file does not exist: %s" % (pfcFile)
        tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
        error = PilotErrors()
        ec = error.ERR_MISSINGPFC

    return ec, pilotErrorDiag, outFilesGuids
def core_get_data(self, envsetup, token, source_surl, dest_path, experiment):
    """ stage-in core function, can be overridden (see stormSiteMover)

    Builds and runs an lcg-cp command copying *source_surl* to *dest_path*,
    choosing timeout flags by lcg-utils version.
    """

    error = PilotErrors()

    # determine which timeout option to use
    if self.isNewLCGVersion("%s lcg-cp" % (envsetup)):
        timeout_option = "--srm-timeout=%d --connect-timeout=300 --sendreceive-timeout=%d" % (self.timeout, self.timeout)
    else:
        timeout_option = "-t %d" % (self.timeout)

    # used lcg-cp options:
    # --vo: specifies the Virtual Organization the user belongs to
    # -t: time-out
    if token:
        # do not use option -b on SL3 clusters running older versions of lcg_utils
        use_b = True
        s, o = commands.getstatusoutput("%s lcg-cr --version" % (envsetup))
        if s != 0:
            # (BDII collects all information coming from site GIISes and stores them in a permanent database)
            tolog("(Probably too old lcg_utils - skipping BDII disabling)")
            use_b = False

        # for the time being
        # NOTE(review): this override makes the use_b branch below unreachable
        use_b = False
        if use_b:
            _cmd_str = '%s lcg-cp --vo atlas --srcsetype srmv2 -s %s -b %s %s file://%s' %\
                       (envsetup, token, timeout_option, source_surl, dest_path)
        else:
            tolog("(Skipping space token for the time being)")
            _cmd_str = '%s lcg-cp --vo atlas %s %s file://%s' % (envsetup, timeout_option, source_surl, dest_path)
    else:
        _cmd_str = '%s lcg-cp --vo atlas %s %s file://%s' % (envsetup, timeout_option, source_surl, dest_path)

    # get the experiment object
    thisExperiment = getExperiment(experiment)

    # add the full stage-out command to the job setup script
    to_script = _cmd_str.replace("file://%s" % os.path.dirname(dest_path), "file://`pwd`")
    to_script = to_script.lstrip(' ')  # remove any initial spaces
    if to_script.startswith('/'):
        to_script = 'source ' + to_script
    thisExperiment.updateJobSetupScript(os.path.dirname(dest_path), to_script=to_script)

    tolog("Executing command: %s" % (_cmd_str))
    s = -1
    o = '(not defined)'
    t0 = os.times()
    try:
        s, o = commands.getstatusoutput(_cmd_str)
    except Exception, e:
        tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" % (str(e), s, o))
        o = str(e)
def getOutFilesGuids(outFiles, workdir, experiment, TURL=False):
    """ get the outFilesGuids from the PFC

    Returns (ec, pilotErrorDiag, outFilesGuids); GUIDs are either generated
    locally (no source file configured) or parsed from the PFC in *workdir*.
    Files not found in the PFC keep a None GUID.
    """

    ec = 0
    pilotErrorDiag = ""
    outFilesGuids = []

    # Get the experiment object and the GUID source filename
    thisExperiment = getExperiment(experiment)
    filename = thisExperiment.getGUIDSourceFilename()

    # If a source file should not be used (ie empty filename string), then generate the GUIDs here
    if filename == "":
        tolog("Pilot will generate GUIDs for the output files")
        for i in range (0, len(outFiles)):
            guid = getGUID()
            if guid == "":
                guid = "- GUID generation failed -"
            outFilesGuids.append(guid)
        return ec, pilotErrorDiag, outFilesGuids
    else:
        tolog("Pilot will get GUIDs for the output files from source: %s" % (filename))
        pfcFile = os.path.join(workdir, filename)  #"%s/PoolFileCatalog.xml" % (workdir)

    # The PFC used for Event Service will be TURL based, use the corresponding file
    if TURL:
        pfcFile = pfcFile.replace(".xml", "TURL.xml")

    # Initialization: make sure the guid list has the same length as the file list
    for i in range (0, len(outFiles)):
        outFilesGuids.append(None)

    # make sure the PFC exists
    if os.path.isfile(pfcFile):
        from xml.dom import minidom
        xmldoc = minidom.parse(pfcFile)
        fileList = xmldoc.getElementsByTagName("File")
        for thisfile in fileList:
            gpfn = str(thisfile.getElementsByTagName("pfn")[0].getAttribute("name"))
            guid = str(thisfile.getAttribute("ID"))
            for i in range(0, len(outFiles)):
                if outFiles[i] == gpfn:
                    outFilesGuids[i] = guid
    else:
        pilotErrorDiag = "PFC file does not exist: %s" % (pfcFile)
        tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
        error = PilotErrors()
        ec = error.ERR_MISSINGPFC

    return ec, pilotErrorDiag, outFilesGuids
def core_get_data(self, envsetup, token, source_surl, local_fullname, experiment):
    """ special get function developed for storm sites

    Resolves the SURL to a TURL with lcg-gt and symlinks the file locally
    instead of copying.  Returns (error_code, pilotErrorDiag) on failure.
    """

    error = PilotErrors()

    # Transform the surl into a full surl
    full_se_endpoint = self.extractSE(readpar('se').split(",")[0])[1]
    prefix = os.path.commonprefix([source_surl, full_se_endpoint])
    if prefix:
        # Can use the bdii-free form
        source_surl = full_se_endpoint + source_surl[len(prefix):]
        _cmd_str = '%s lcg-gt --nobdii --setype srmv2 "%s" file' % (envsetup, source_surl)
    else:
        # Fallback solution, use old lcg-gt form
        # get the TURL using the SURL
        # NOTE(review): tag below is malformed ("!!WARNING!1234!!", single '!')
        # and may be missed by !!WARNING!!-pattern log scanners — confirm before fixing
        tolog("!!WARNING!1234!! Source surl does not match %s, cannot use the bdii-independent lcg-gt" % full_se_endpoint)
        _cmd_str = '%s lcg-gt "%s" file' % (envsetup, source_surl)

    tolog("Executing command: %s" % (_cmd_str))
    t0 = os.times()
    s, o = commands.getstatusoutput(_cmd_str)
    t1 = os.times()
    t = t1[4] - t0[4]
    tolog("Command finished after %f s" % (t))

    if s == 0:
        # get the experiment object
        thisExperiment = getExperiment(experiment)

        # add the full stage-out command to the job setup script
        to_script = _cmd_str
        to_script = to_script.lstrip(' ')  # remove any initial spaces
        if to_script.startswith('/'):
            to_script = 'source ' + to_script
        thisExperiment.updateJobSetupScript(os.path.dirname(local_fullname), to_script=to_script)

        # lcg-gt output is expected to be "<turl>\n<request token>"
        source_turl, req_token = o.split('\n')
        source_turl = source_turl.replace('file://','')
        tolog("Creating link from %s to %s" % (source_turl, local_fullname))
        try:
            os.symlink(source_turl, local_fullname)
            _cmd_str = '%s lcg-sd %s %s 0' % (envsetup, source_surl, req_token)
            tolog("Executing command: %s" % (_cmd_str))
            s,o = commands.getstatusoutput(_cmd_str)
            # Do we need to check the exit status of lcg-sd? What do we do if it fails?
            tolog("get_data succeeded")
        except Exception, e:
            pilotErrorDiag = "Exception caught: %s" % str(e)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            tolog("get_data failed")
            return error.ERR_STAGEINFAILED, pilotErrorDiag
def setup(self, experiment):
    """ setup env

    Builds the site environment-setup string (with X509 proxy export when
    available), verifies it, and falls back to the default setup on failure.
    Returns the (status, output) pair of the last verification.
    """
    if self.__isSetuped:
        return 0, None

    self.__experiment = experiment
    thisExperiment = getExperiment(experiment)
    self.useTracingService = thisExperiment.useTracingService()
    si = getSiteInformation(experiment)
    self._defaultSetup = self.getLocalROOTSetup(si)
    _setupStr = self._defaultSetup  #self.getSetup()

    def _build_candidate(setup_str):
        # normalise trailing ';' and append the user proxy export if available
        cmd = setup_str.strip()
        if cmd != "" and not cmd.endswith(';'):
            cmd += ";"
        if os.environ.has_key('X509_USER_PROXY'):
            cmd += " export X509_USER_PROXY=%s;" % (os.environ['X509_USER_PROXY'])
        return cmd

    envsetupTest = _build_candidate(_setupStr)
    self.log("to verify site setup: %s " % envsetupTest)
    status, output = self.verifySetup(envsetupTest, experiment)
    self.log("site setup verifying: status: %s, output: %s" % (status, output["errorLog"]))
    if status == 0:
        self._setup = envsetupTest
        self.__isSetuped = True
        return status, output

    if self._defaultSetup:
        # try to use default setup
        self.log("Try to use default envsetup")
        envsetupTest = _build_candidate(self._defaultSetup)
        self.log("verify default setup: %s " % envsetupTest)
        status, output = self.verifySetup(envsetupTest, experiment)
        self.log("default setup verifying: status: %s, output: %s" % (status, output["errorLog"]))
        if status == 0:
            self._setup = envsetupTest
            self.__isSetuped = True

    return status, output
def interpretPayload(self, job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, failureCode): """ Interpret the payload, look for specific errors in the stdout """ # get the experiment object thisExperiment = getExperiment(job.experiment) if not thisExperiment: job.pilotErrorDiag = "ErrorDiagnosis did not get an experiment object from the factory" job.result[2] = self.__error.ERR_GENERALERROR # change to better/new error code tolog("!!WARNING!!3234!! %s" % (job.pilotErrorDiag)) return job ### WARNING: EXPERIMENT SPECIFIC, MOVE LATER try: ec, pilotErrorDiag = self.processJobReport(job.workdir) except Exception, e: tolog("!!WARNING!!1114!! Caught exception: %s" % (e))
def getUtilityInfo(self, node, experiment, workdir): """ Add the utility info to the node structure if available """ # Get the experiment object and check if the special utility (e.g. a memory monitor) was used thisExperiment = getExperiment(experiment) if thisExperiment.shouldExecuteUtility(): # Try to get the memory monitor info from the workdir first path = os.path.join(workdir, thisExperiment.getUtilityJSONFilename()) init_path = os.path.join(self.__pilot_initdir, thisExperiment.getUtilityJSONFilename()) if not os.path.exists(path): tolog("File does not exist: %s" % (path)) if os.path.exists(init_path): path = init_path else: tolog("File does not exist either: %s" % (path)) path = "" if path != "": tolog("Reading memory monitoring info from: %s" % (path)) # Get the dictionary d = getJSONDictionary(path) if d and d != {}: try: node['maxRSS'] = d['Max']['maxRSS'] node['maxVMEM'] = d['Max']['maxVMEM'] node['maxSWAP'] = d['Max']['maxSwap'] node['maxPSS'] = d['Max']['maxPSS'] node['avgRSS'] = d['Avg']['avgRSS'] node['avgVMEM'] = d['Avg']['avgVMEM'] node['avgSWAP'] = d['Avg']['avgSwap'] node['avgPSS'] = d['Avg']['avgPSS'] except Exception, e: tolog("!!WARNING!!54541! Exception caught while parsing memory monitor JSON: %s" % (e)) else: tolog("Extracted info from memory monitor JSON") # Done with the memory monitor for this job (if the file is read from the pilots' init dir), remove the file in case there are other jobs to be run if os.path.exists(init_path): try: os.system("rm -rf %s" % (init_path)) except Exception, e: tolog("!!WARNING!!4343!! Failed to remove %s: %s" % (init_path), e) else: tolog("Removed %s" % (init_path))
def setup(self, experiment):
    """ setup env

    Verifies the site setup string (with X509 proxy export when available);
    on failure retries with the default setup.  Returns (status, output).
    """
    if self.__isSetuped:
        return 0, None

    self.__experiment = experiment
    thisExperiment = getExperiment(experiment)
    self.useTracingService = thisExperiment.useTracingService()
    si = getSiteInformation(experiment)
    self._defaultSetup = self.getLocalROOTSetup(si)
    _setupStr = self._defaultSetup  #self.getSetup()

    # get the user proxy if available
    envsetupTest = _setupStr.strip()
    if envsetupTest != "" and not envsetupTest.endswith(';'):
        envsetupTest += ";"
    if os.environ.has_key('X509_USER_PROXY'):
        envsetupTest += " export X509_USER_PROXY=%s;" % (os.environ['X509_USER_PROXY'])

    self.log("to verify site setup: %s " % envsetupTest)
    status, output = self.verifySetup(envsetupTest, experiment)
    self.log("site setup verifying: status: %s, output: %s" % (status, output["errorLog"]))
    if status == 0:
        self._setup = envsetupTest
        self.__isSetuped = True
        return status, output
    else:
        if self._defaultSetup:
            # try to use default setup
            self.log("Try to use default envsetup")
            envsetupTest = self._defaultSetup.strip()
            if envsetupTest != "" and not envsetupTest.endswith(';'):
                envsetupTest += ";"
            if os.environ.has_key('X509_USER_PROXY'):
                envsetupTest += " export X509_USER_PROXY=%s;" % (os.environ['X509_USER_PROXY'])

            self.log("verify default setup: %s " % envsetupTest)
            status, output = self.verifySetup(envsetupTest, experiment)
            self.log("default setup verifying: status: %s, output: %s" % (status, output["errorLog"]))
            if status == 0:
                self._setup = envsetupTest
                self.__isSetuped = True
                return status, output

    return status, output
def getJobMetrics(self, job, workerNode): """ Return a properly formatted job metrics string """ # style: Number of events read | Number of events written | vmPeak maximum | vmPeak average | RSS average | JEM activation # format: nEvents=<int> nEventsW=<int> vmPeakMax=<int> vmPeakMean=<int> RSSMean=<int> JEM=<string> # hs06=<float> shutdownTime=<int> cpuFactor=<float> cpuLimit=<float> diskLimit=<float> jobStart=<int> memLimit=<int> runLimit=<float> # get the experiment object thisExperiment = getExperiment(job.experiment) if job.coreCount: # Always use the ATHENA_PROC_NUMBER first, if set if os.environ.has_key('ATHENA_PROC_NUMBER'): try: job.coreCount = int(os.environ['ATHENA_PROC_NUMBER']) except Exception, e: tolog("ATHENA_PROC_NUMBER is not properly set: %s (will use existing job.coreCount value)" % (e)) coreCount = job.coreCount
def extractJobInformation(self, job, runCommandList): """ Extract relevant job information, e.g. number of events """ # get the experiment object thisExperiment = getExperiment(job.experiment) if not thisExperiment: job.pilotErrorDiag = "ErrorDiagnosis did not get an experiment object from the factory" job.result[2] = self.__error.ERR_GENERALERROR # change to better/new error code tolog("!!WARNING!!3234!! %s" % (job.pilotErrorDiag)) return job # note that this class should not be experiment specific, so move anything related to ATLAS to ATLASExperiment.py # and use thisExperiment.whatever() to retrieve it here # grab the number of events try: # nEvents_str can be a string of the form N|N|..|N with the number of jobs in the trf(s) [currently not used] # Add to Job class if necessary job.nEvents, job.nEventsW, nEvents_str = thisExperiment.getNumberOfEvents(job=job, number_of_jobs=len(runCommandList)) except Exception, e: tolog("!!WARNING!!2999!! Failed to get number of events: %s (ignore)" % str(e))
if runJob.getPilotLogFilename() != "": pUtil.setPilotlogFilename(runJob.getPilotLogFilename()) # set node info node = Node.Node() node.setNodeName(os.uname()[1]) node.collectWNInfo(jobSite.workdir) # redirect stder sys.stderr = open("%s/runjob.stderr" % (jobSite.workdir), "w") tolog("Current job workdir is: %s" % os.getcwd()) tolog("Site workdir is: %s" % jobSite.workdir) # get the experiment object thisExperiment = getExperiment(runJob.getExperiment()) tolog("RunJob will serve experiment: %s" % (thisExperiment.getExperiment())) # set the cache (used e.g. by LSST) #if runJob.getCache(): # thisExperiment.setCache(runJob.getCache()) #JR = JobRecovery() try: job = Job.Job() job.setJobDef(newJobDef.job) job.workdir = jobSite.workdir job.experiment = runJob.getExperiment() # figure out and set payload file names
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """ copy input file from SE to local dir

    Visible portion: parameter extraction, setup/proxy verification, and
    rewriting of the srm:// prefix of the source path against the site's
    configured SE.  (The function continues beyond this chunk.)
    """

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    jobId = pdict.get('jobId', '')
    workDir = pdict.get('workDir', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)

    # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
    useCT = pdict.get('usect', True)
    prodDBlockToken = pdict.get('access', '')

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 'lcg2', lfn, guid)

    # get a proper envsetup
    envsetup = self.getEnvsetup(get=True)

    ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
    if ec != 0:
        self.prepareReport('RFCP_FAIL', report)
        return ec, pilotErrorDiag

    # get the experiment object
    thisExperiment = getExperiment(experiment)

    if proxycheck:
        # do we have a valid proxy?
        s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup)
        if s != 0:
            self.prepareReport('PROXYFAIL', report)
            return s, pilotErrorDiag
    else:
        tolog("Proxy verification turned off")

    getfile = gpfn

    if path == '':
        path = './'
    fullname = os.path.join(path, lfn)

    # can not test filesize and checksum if remote values are not known
    if fsize == 0 or fchecksum == 0:
        tolog("!!WARNING!!2999!! Remote file size/checksum not known: %s/%s" % (fsize, fchecksum))

    # Maybe be a comma list but take first always
    # (Remember that se can be a list where the first is used for output but any can be used for input)
    se = readpar('se').split(",")[0]
    _dummytoken, se = self.extractSE(se)
    tolog("Using SE: %s" % (se))
    # se = srm://head01.aglt2.org:8443/srm/managerv2?SFN=

    # for srm protocol, use the full info from 'se'
    if getfile[:3] == "srm":
        try:
            # e.g. tmp = ['srm:', '', 'head01.aglt2.org', 'pnfs/aglt2.org/rucio/panda/dis/08/...']
            tmp = getfile.split('/',3)[2]
        except Exception, e:
            tolog('!!WARNING!!2999!! Could not extract srm protocol for replacement, keeping getfile variable as it is: %s (%s)' %\
                  (getfile, str(e)))
        else:
            # replace srm with 'srm://head01.aglt2.org:8443/srm/managerv2?SFN=' if not there already
            if not '?SFN=' in getfile:
                # srm = 'srm://head01.aglt2.org'
                srm = 'srm://' + tmp

                # does seopt contain any matching srm's?
                sematch = self.getSEMatchFromSEOpt(srm)
                if sematch != "":
                    getfile = getfile.replace(srm, sematch)
                    tolog("Replaced %s with %s (from seopt) in getfile: %s" % (srm, sematch, getfile))
                else:
                    getfile = getfile.replace(srm, se)
                    tolog("Replaced %s with %s (from se) in getfile: %s" % (srm, se, getfile))
            else:
                tolog("Found SFN part in getfile: %s" % (getfile))

        # add port number from se to getfile if necessary
        getfile = self.addPortToPath(se, getfile)
def getFileSystemRootPath(experiment):
    """ Return the proper file system root path (cvmfs) """
    # resolve the experiment object and ask it for its CVMFS root
    return getExperiment(experiment).getCVMFSPath()
if pilotlogfilename != "": pUtil.setPilotlogFilename(pilotlogfilename) # set node info node = Node.Node() node.setNodeName(os.uname()[1]) node.collectWNInfo(jobSite.workdir) # redirect stder sys.stderr = open("%s/runjob.stderr" % (jobSite.workdir), "w") tolog("Current job workdir is: %s" % os.getcwd()) tolog("Site workdir is: %s" % jobSite.workdir) # get the experiment object thisExperiment = getExperiment(experiment) tolog("runJob will serve experiment: %s" % (thisExperiment.getExperiment())) region = readpar('region') JR = JobRecovery() try: job = Job.Job() job.setJobDef(newJobDef.job) job.workdir = jobSite.workdir job.experiment = experiment # figure out and set payload file names job.setPayloadName(thisExperiment.getPayloadName(job)) except Exception, e: pilotErrorDiag = "Failed to process job info: %s" % str(e) tolog("!!WARNING!!3000!! %s" % (pilotErrorDiag)) failJob(0, error.ERR_UNKNOWN, job, pilotserver, pilotport, pilotErrorDiag=pilotErrorDiag)
def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
    """ copy output file from disk to local SE

    Visible portion: parameter extraction, tracing-report setup, checksum
    preparation, proxy verification and SURL path resolution.  (The function
    continues beyond this chunk.)
    """
    # function is based on dCacheSiteMover put function

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    token = pdict.get('token', '')
    scope = pdict.get('scope', '')
    dsname = pdict.get('dsname', '')
    analysisJob = pdict.get('analJob', False)
    testLevel = pdict.get('testLevel', '0')
    extradirs = pdict.get('extradirs', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)
    prodSourceLabel = pdict.get('prodSourceLabel', '')

    # get the site information object
    si = getSiteInformation(experiment)

    tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
    if prodSourceLabel == 'ddm' and analysisJob:
        tolog("Treating PanDA Mover job as a production job during stage-out")
        analysisJob = False

    # get the Rucio tracing report
    try:
        report = pdict['report']
    except:
        report = {}
    else:
        # set the proper protocol
        report['protocol'] = 'curl'
        # mark the relative start
        report['catStart'] = time()
        # the current file
        report['filename'] = lfn
        # guid
        report['guid'] = guid.replace('-','')

    # preparing variables
    if fsize == 0 or fchecksum == 0:
        ec, pilotErrorDiag, fsize, fchecksum = self.getLocalFileInfo(source, csumtype="adler32")
        if ec != 0:
            self.prepareReport('LOCAL_FILE_INFO_FAIL', report)
            return self.put_data_retfail(ec, pilotErrorDiag)

    # now that the file size is known, add it to the tracing report
    report['filesize'] = fsize

    # get the checksum type
    if fchecksum != 0 and fchecksum != "":
        csumtype = self.getChecksumType(fchecksum)
    else:
        csumtype = "default"

    # get a proper envsetup
    envsetup = self.getEnvsetup()

    # get the experiment object
    thisExperiment = getExperiment(experiment)

    if proxycheck:
        s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup, limit=2)
        if s != 0:
            self.prepareReport('NO_PROXY', report)
            return self.put_data_retfail(error.ERR_NOPROXY, pilotErrorDiag)
    else:
        tolog("Proxy verification turned off")

    filename = os.path.basename(source)

    # get all the proper paths
    ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(error, analysisJob, token, prodSourceLabel, dsname, filename, scope=scope, sitemover=self)  # quick workaround
    if ec != 0:
        self.prepareReport(tracer_error, report)
        return self.put_data_retfail(ec, pilotErrorDiag)

    putfile = surl
    full_surl = putfile
    if full_surl[:len('token:')] == 'token:':
        # remove the space token (e.g. at Taiwan-LCG2) from the SURL info
        full_surl = full_surl[full_surl.index('srm://'):]

    # srm://dcache01.tier2.hep.manchester.ac.uk/pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/
    #testpanda.destDB/testpanda.destDB.604b4fbc-dbe9-4b05-96bb-6beee0b99dee_sub0974647/
    #86ecb30d-7baa-49a8-9128-107cbfe4dd90_0.job.log.tgz
    tolog("putfile: %s" % (putfile))
    tolog("full_surl: %s" % (full_surl))

    # get https surl
    full_http_surl = full_surl.replace("srm://", "https://")

    # get the RSE from ToA
    try:
        _RSE = self.getRSE(surl=putfile)
    except Exception, e:
        tolog("Warning: Failed to get RSE: %s (can not add this info to tracing report)" % str(e))
def interpretPayloadStdout(self, job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, failureCode):
    """
    Interpret the payload exit status / stdout and record the corresponding
    error information on the job object (payload error handling).

    :param job: job object; pilotErrorDiag and result[2] are updated in place.
    :param res: payload result tuple; res[0] is the exit status, res[1] the
        captured stdout (res[2], trf diagnostics, is only referenced in the
        dead code block below).
    :param getstatusoutput_was_interrupted: True if the payload command was
        aborted by an exception.
    :param current_job_number: ordinal of the current (sub)job, used to pick
        the right stdout file.
    :param runCommandList: list of payload run commands (one per subjob).
    :param failureCode: interrupt failure code, if any.
    :returns: the (updated) job object.
    :raises Exception: if job execution was interrupted (caller handles it).
    """
    # NOTE: Move away ATLAS specific info in this method, e.g. vmPeak stuff

    error = PilotErrors()
    #Mancinelli: moved it in experiment class method handleTrfExitcode
    #transExitCode = res[0]%255
    tolog("Mancinellidebug: res = %s res[0] = %s" % (res, res[0]))

    # Get the proper stdout filename
    number_of_jobs = len(runCommandList)
    filename = getStdoutFilename(job.workdir, job.stdout, current_job_number, number_of_jobs)

    # get the experiment object
    thisExperiment = getExperiment(job.experiment)
    if not thisExperiment:
        job.pilotErrorDiag = "ErrorDiagnosis did not get an experiment object from the factory"
        job.result[2] = error.ERR_GENERALERROR # change to better/new error code
        tolog("!!WARNING!!3334!! %s" % (job.pilotErrorDiag))
        return job

    # Try to identify out of memory errors in the stderr
    out_of_memory = thisExperiment.isOutOfMemory(job=job, number_of_jobs=number_of_jobs)
    failed = out_of_memory # failed boolean used below

    # Always look for the max and average VmPeak?
    setup = getSourceSetup(runCommandList[0])
    job.vmPeakMax, job.vmPeakMean, job.RSSMean = findVmPeaks(setup)

    # A killed job can have empty output but still transExitCode == 0
    no_payload_output = False
    installation_error = False
    if getstatusoutput_was_interrupted:
        # interrupted payload: a missing or empty stdout file means no output
        if os.path.exists(filename):
            if os.path.getsize(filename) > 0:
                tolog("Payload produced stdout but was interrupted (getstatusoutput threw an exception)")
            else:
                no_payload_output = True
            failed = True
        else:
            failed = True
            no_payload_output = True
    elif len(res[1]) < 20: # protect the following comparison against massive outputs
        # only the literal 'Undefined' marks missing payload output here
        if res[1] == 'Undefined':
            failed = True
            no_payload_output = True
    elif failureCode:
        failed = True
    else:
        # check for installation error
        res_tmp = res[1][:1024]
        if res_tmp[0:3] == "sh:" and 'setup.sh' in res_tmp and 'No such file or directory' in res_tmp:
            failed = True
            installation_error = True

    # map the detected failure condition to a pilot error code / diagnostic
    if res[0] or failed:
        #Mancinelli: all this common part with CMS?
        if failureCode:
            job.pilotErrorDiag = "Payload failed: Interrupt failure code: %d" % (failureCode)
            # (do not set pilot error code)
        elif getstatusoutput_was_interrupted:
            raise Exception, "Job execution was interrupted (see stderr)"
        elif out_of_memory:
            job.pilotErrorDiag = "Payload ran out of memory"
            job.result[2] = error.ERR_ATHENAOUTOFMEMORY
        elif no_payload_output:
            job.pilotErrorDiag = "Payload failed: No output"
            job.result[2] = error.ERR_NOPAYLOADOUTPUT
        elif installation_error:
            job.pilotErrorDiag = "Payload failed: Missing installation"
            job.result[2] = error.ERR_MISSINGINSTALLATION
        elif res[0]:
            #Mancinelli: calling for experiment class method to manage transformation exit code
            job = thisExperiment.handleTrfExitcode(job, res, error, filename)
        else:
            job.pilotErrorDiag = "Payload failed due to unknown reason (check payload stdout)"
            job.result[2] = error.ERR_UNKNOWN
        tolog("!!FAILED!!3000!! %s" % (job.pilotErrorDiag))

    # note: several errors below are atlas specific (not all), should be handled through ATLASExperiment via thisExperiment object
    # move entire section below to ATLASExperiment, define prototype [empty] methods in Experiment and OtherExperiment classes, implement in ATLASExperiment
    # non experiment specific errors should be handled here (e.g. no_payload_output)

    # handle non-zero failed job return code but do not set pilot error codes to all payload errors
    # NOTE(review): the block below is dead code kept as a string literal
    """
    if transExitCode or failed:
        if failureCode:
            job.pilotErrorDiag = "Payload failed: Interrupt failure code: %d" % (failureCode)
            # (do not set pilot error code)
        elif getstatusoutput_was_interrupted:
            raise Exception, "Job execution was interrupted (see stderr)"
        elif out_of_memory:
            job.pilotErrorDiag = "Payload ran out of memory"
            job.result[2] = error.ERR_ATHENAOUTOFMEMORY
        elif no_payload_output:
            job.pilotErrorDiag = "Payload failed: No output"
            job.result[2] = error.ERR_NOPAYLOADOUTPUT
        elif installation_error:
            job.pilotErrorDiag = "Payload failed: Missing installation"
            job.result[2] = error.ERR_MISSINGINSTALLATION
        elif transExitCode:
            # Handle PandaMover errors
            if transExitCode == 176:
                job.pilotErrorDiag = "PandaMover staging error: File is not cached"
                job.result[2] = error.ERR_PANDAMOVERFILENOTCACHED
            elif transExitCode == 86:
                job.pilotErrorDiag = "PandaMover transfer failure"
                job.result[2] = error.ERR_PANDAMOVERTRANSFER
            else:
                # check for specific errors in athena stdout
                if os.path.exists(filename):
                    e1 = "prepare 5 database is locked"
                    e2 = "Error SQLiteStatement"
                    _out = commands.getoutput('grep "%s" %s | grep "%s"' % (e1, filename, e2))
                    if 'sqlite' in _out:
                        job.pilotErrorDiag = "NFS/SQLite locking problems: %s" % (_out)
                        job.result[2] = error.ERR_NFSSQLITE
                    else:
                        job.pilotErrorDiag = "Job failed: Non-zero failed job return code: %d" % (transExitCode)
                        # (do not set a pilot error code)
                else:
                    job.pilotErrorDiag = "Job failed: Non-zero failed job return code: %d (%s does not exist)" % (transExitCode, filename)
                    # (do not set a pilot error code)
        else:
            job.pilotErrorDiag = "Payload failed due to unknown reason (check payload stdout)"
            job.result[2] = error.ERR_UNKNOWN
        tolog("!!FAILED!!3000!! %s" % (job.pilotErrorDiag))

    # set the trf diag error
    if res[2] != "":
        tolog("TRF diagnostics: %s" % (res[2]))
        job.exeErrorDiag = res[2]

    job.result[1] = transExitCode
    """

    return job
if runJob.getPilotLogFilename() != "": pUtil.setPilotlogFilename(runJob.getPilotLogFilename()) # set node info node = Node.Node() node.setNodeName(os.uname()[1]) node.collectWNInfo(jobSite.workdir) # redirect stder sys.stderr = open("%s/runjob.stderr" % (jobSite.workdir), "w") tolog("Current job workdir is: %s" % os.getcwd()) tolog("Site workdir is: %s" % jobSite.workdir) # get the experiment object thisExperiment = getExperiment(runJob.getExperiment()) tolog("RunJob will serve experiment: %s" % (thisExperiment.getExperiment())) # set the cache (used e.g. by LSST) #if runJob.getCache(): # thisExperiment.setCache(runJob.getCache()) #JR = JobRecovery() try: job = Job.Job() job.setJobDef(newJobDef.job) job.workdir = jobSite.workdir job.experiment = runJob.getExperiment() # figure out and set payload file names job.setPayloadName(thisExperiment.getPayloadName(job))
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """
    Copy an input file from the SE to the local directory (stage-in).

    :param gpfn: remote physical file name (possibly an srm:// SURL).
    :param lfn: logical file name; local file becomes path/lfn.
    :param path: local destination directory ('' defaults to './').
    :param fsize: remote file size (0 if unknown).
    :param fchecksum: remote checksum (0 if unknown).
    :param guid: file GUID.
    :param pdict: transfer metadata (jobId, workDir, experiment, proxycheck,
        usect, access, report, ...).
    :returns: on the visible failure paths, (error_code, diagnostics).
    """
    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    jobId = pdict.get('jobId', '')
    workDir = pdict.get('workDir', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)

    # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
    useCT = pdict.get('usect', True)
    prodDBlockToken = pdict.get('access', '')

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict['report'], 'lcg2', lfn, guid)

    # get a proper envsetup
    envsetup = self.getEnvsetup(get=True)

    ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
    if ec != 0:
        self.prepareReport('RFCP_FAIL', report)
        return ec, pilotErrorDiag

    # get the experiment object
    thisExperiment = getExperiment(experiment)

    if proxycheck:
        # do we have a valid proxy?
        s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup)
        if s != 0:
            self.prepareReport('PROXYFAIL', report)
            return s, pilotErrorDiag
    else:
        tolog("Proxy verification turned off")

    getfile = gpfn

    if path == '': path = './'
    fullname = os.path.join(path, lfn)

    # can not test filesize and checksum if remote values are not known
    if fsize == 0 or fchecksum == 0:
        tolog("!!WARNING!!2999!! Remote file size/checksum not known: %s/%s" % (fsize, fchecksum))

    # Maybe be a comma list but take first always
    # (Remember that se can be a list where the first is used for output but any can be used for input)
    se = readpar('se').split(",")[0]
    _dummytoken, se = self.extractSE(se)
    tolog("Using SE: %s" % (se))
    # se = srm://head01.aglt2.org:8443/srm/managerv2?SFN=

    # for srm protocol, use the full info from 'se'
    if getfile[:3] == "srm":
        try:
            # extract the host part of the SURL,
            # e.g. tmp = ['srm:', '', 'head01.aglt2.org', 'pnfs/aglt2.org/rucio/panda/dis/08/...'][2]
            tmp = getfile.split('/', 3)[2]
        except Exception, e:
            tolog('!!WARNING!!2999!! Could not extract srm protocol for replacement, keeping getfile variable as it is: %s (%s)' %\
                  (getfile, str(e)))
        else:
            # replace srm with 'srm://head01.aglt2.org:8443/srm/managerv2?SFN=' if not there already
            if not '?SFN=' in getfile:
                # srm = 'srm://head01.aglt2.org'
                srm = 'srm://' + tmp

                # does seopt contain any matching srm's?
                sematch = self.getSEMatchFromSEOpt(srm)
                if sematch != "":
                    getfile = getfile.replace(srm, sematch)
                    tolog("Replaced %s with %s (from seopt) in getfile: %s" % (srm, sematch, getfile))
                else:
                    getfile = getfile.replace(srm, se)
                    tolog("Replaced %s with %s (from se) in getfile: %s" % (srm, se, getfile))
            else:
                tolog("Found SFN part in getfile: %s" % (getfile))

    # add port number from se to getfile if necessary
    getfile = self.addPortToPath(se, getfile)
def put_data(self, pfn, destination, fsize=0, fchecksum=0, dsname='', extradirs='', **pdict):
    """
    Copy an output file from the local disk to the local SE using lcg-cr,
    registering it in the LFC at the same time.

    :param pfn: local physical file name to be staged out.
    :param destination: destination endpoint (the SURL used for the transfer
        is derived via SiteInformation.getProperPaths).
    :param fsize: file size in bytes; 0 means "determine it locally".
    :param fchecksum: file checksum; 0/"" means "compute it locally" (adler32).
    :param dsname: dataset name; required (empty value is a failure).
    :param extradirs: extra directory components (unused in the visible part).
    :param pdict: transfer metadata (lfn, guid, token, scope, experiment,
        proxycheck, analJob, prodSourceLabel, report, ...).
    :returns: on the visible failure paths, the tuple from put_data_retfail().
    """
    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    token = pdict.get('token', '')
    scope = pdict.get('scope', '')
    logFile = pdict.get('logFile', '')
    sitename = pdict.get('sitename', '')
    proxycheck = pdict.get('proxycheck', False)
    experiment = pdict.get('experiment', '')
    analysisJob = pdict.get('analJob', False)
    prodSourceLabel = pdict.get('prodSourceLabel', '')

    # get the site information object
    si = getSiteInformation(experiment)

    # PanDA Mover ('ddm') jobs always use the production stage-out path
    if prodSourceLabel == 'ddm' and analysisJob:
        tolog("Treating PanDA Mover job as a production job during stage-out")
        analysisJob = False

    filename = pfn.split('/')[-1]

    # get the DQ2 tracing report
    report = self.getStubTracingReport(pdict['report'], 'lcg', lfn, guid)

    # is the dataset defined?
    if dsname == '':
        pilotErrorDiag = "Dataset name not specified to put_data"
        tolog('!!WARNING!!2990!! %s' % (pilotErrorDiag))
        self.prepareReport('DSN_UNDEF', report)
        return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)

    # preparing variables
    if fsize == 0 or fchecksum == 0:
        ec, pilotErrorDiag, fsize, fchecksum = self.getLocalFileInfo(pfn, csumtype="adler32")
        if ec != 0:
            self.prepareReport('LOCAL_FILE_INFO_FAIL', report)
            return self.put_data_retfail(ec, pilotErrorDiag)

    # now that the file size is known, add it to the tracing report
    report['filesize'] = fsize

    # get a proper envsetup
    envsetup = self.getEnvsetup()

    ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
    if ec != 0:
        self.prepareReport('RFCP_FAIL', report)
        return self.put_data_retfail(ec, pilotErrorDiag)

    # get the experiment object
    thisExperiment = getExperiment(experiment)

    # do we need to check the user proxy?
    if proxycheck:
        s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup, limit=2)
        if s != 0:
            self.prepareReport('PROXY_FAIL', report)
            return self.put_data_retfail(error.ERR_NOPROXY, pilotErrorDiag)
    else:
        tolog("Proxy verification turned off")

    # get all the proper paths
    ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(error, analysisJob, token, prodSourceLabel, dsname, filename, scope=scope, sitemover=self) # quick workaround
    if ec != 0:
        self.prepareReport(tracer_error, report)
        return self.put_data_retfail(ec, pilotErrorDiag, surl=dst_gpfn)

    lfclfn = os.path.join(lfcdir, lfn)
    # LFC LFN = /grid/atlas/dq2/testpanda/testpanda.destDB.dfb45803-1251-43bb-8e7a-6ad2b6f205be_sub01000492/
    # 364aeb74-8b62-4c8f-af43-47b447192ced_0.job.log.tgz

    # putfile is the SURL
    putfile = surl
    full_surl = putfile
    if full_surl[:len('token:')] == 'token:':
        # remove the space token (e.g. at Taiwan-LCG2) from the SURL info
        full_surl = full_surl[full_surl.index('srm://'):]

    # srm://dcache01.tier2.hep.manchester.ac.uk/pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/
    # testpanda.destDB/testpanda.destDB.604b4fbc-dbe9-4b05-96bb-6beee0b99dee_sub0974647/
    # 86ecb30d-7baa-49a8-9128-107cbfe4dd90_0.job.log.tgz
    tolog("putfile = %s" % (putfile))
    tolog("full_surl = %s" % (full_surl))

    # get the DQ2 site name from ToA
    try:
        _dq2SiteName = self.getDQ2SiteName(surl=putfile)
    except:
        # WARNING: do not print the exception here since it can sometimes not be converted to a string! (problem seen at Taiwan)
        tolog("Warning: Failed to get the DQ2 site name (can not add this info to tracing report)")
    else:
        report['localSite'], report['remoteSite'] = (_dq2SiteName, _dq2SiteName)
        tolog("DQ2 site name: %s" % (_dq2SiteName))

    # get the absolute (full) path to the file
    fppfn = os.path.abspath(pfn)
    tolog("pfn=%s" % (pfn))

    # create the LFC directory before registering the file
    cmd = '%s echo "LFC_HOST=$LFC_HOST"; lfc-mkdir -p %s' % (envsetup, lfcdir)
    # export LFC_HOST=lfc0448.gridpp.rl.ac.uk ; echo "LFC_HOST=$LFC_HOST";
    # lfc-mkdir -p /grid/atlas/dq2/testpanda.destDB/testpanda.destDB.604b4fbc-dbe9-4b05-96bb-6beee0b99dee_sub0974647
    tolog("Executing command: %s" % (cmd))
    s, o = commands.getstatusoutput(cmd)
    if s == 0:
        tolog("LFC setup and mkdir succeeded")
        tolog("Command output: %s" % (o))
    else:
        tolog("!!WARNING!!2990!! LFC setup and mkdir failed. Status=%s Output=%s" % (s, o))
        if o == "Could not establish context":
            pilotErrorDiag = "Could not establish context: Proxy / VO extension of proxy has probably expired"
            tolog("!!WARNING!!2990!! %s" % (pilotErrorDiag))
            self.dumpExtendedProxy(envsetup)
            self.prepareReport('CONTEXT_FAIL', report)
            return self.put_data_retfail(error.ERR_NOPROXY, pilotErrorDiag, surl=full_surl)
        else:
            pilotErrorDiag = "LFC setup and mkdir failed: %s" % (o)
            self.prepareReport('LFC_SETUP_FAIL', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag, surl=full_surl)

    # determine which timeout option to use
    if self.isNewLCGVersion("%s lcg-cr" % (envsetup)):
        timeout_option = "--srm-timeout=%d --connect-timeout=300 --sendreceive-timeout=%d" % (self.timeout, self.timeout)
    else:
        timeout_option = "-t %d" % (self.timeout)

    # used lcg-cr options:
    # --verbose: verbosity on
    # --vo: specifies the Virtual Organization the user belongs to
    # -T: specify SRM version
    # -s: space token description
    # -b: BDII disabling
    # -t: time-out
    # -l: specifies the Logical File Name associated with the file. If this option is present, an entry is added to the LFC
    # -g: specifies the Grid Unique IDentifier. If this option is not present, a GUID is generated internally
    # -d: specifies the destination. It can be the Storage Element fully qualified hostname or an SURL. In the latter case,
    #     the scheme can be sfn: for a classical SE or srm:. If only the fully qualified hostname is given, a filename is
    #     generated in the same format as with the Replica Manager
    if token:
        # Special case for GROUPDISK (do not remove dst: bit before this stage, needed in several places)
        # NOTE(review): any 'dst:' token is unconditionally reset to ATLASGROUPDISK here
        if "dst:" in token:
            token = token[len('dst:'):]
            tolog("Dropped dst: part of space token descriptor; token=%s" % (token))
            token = "ATLASGROUPDISK"
            tolog("Space token descriptor reset to: %s" % (token))

        surl = putfile[putfile.index('srm://'):]
        _cmd_str = '%s which lcg-cr; lcg-cr --version; lcg-cr --verbose --vo atlas -T srmv2 -s %s -b %s -l %s -g %s -d %s file:%s' % (envsetup, token, timeout_option, lfclfn, guid, surl, fppfn)
    else:
        surl = putfile
        _cmd_str = '%s which lcg-cr; lcg-cr --version; lcg-cr --verbose --vo atlas %s -l %s -g %s -d %s file:%s' % (envsetup, timeout_option, lfclfn, guid, surl, fppfn)
        # GoeGrid testing: _cmd_str = '%s which lcg-cr; lcg-cr --version; lcg-crXXX --verbose --vo atlas %s -l %s -g %s -d %s file:%s' % (envsetup, timeout_option, lfclfn, guid, surl, fppfn)

    tolog("Executing command: %s" % (_cmd_str))
    s = -1
    t0 = os.times()
    report['relativeStart'] = time()
    report['transferStart'] = time()
    try:
        s, o = commands.getstatusoutput(_cmd_str)
    except Exception, e:
        tolog("!!WARNING!!2990!! Exception caught: %s" % (str(e)))
        o = str(e)
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """
    Copy an input file from the SE to the local directory (stage-in),
    falling back to an LFC replica lookup when remote size/checksum are
    unknown.

    :param gpfn: remote physical file name.
    :param lfn: logical file name; local file becomes path/lfn.
    :param path: local destination directory ('' defaults to './').
    :param fsize: remote file size (0 if unknown).
    :param fchecksum: remote checksum (0 if unknown).
    :param guid: file GUID, used for the LFC replica lookup.
    :param pdict: transfer metadata (token, jobId, workDir, experiment,
        proxycheck, usect, access, report, ...).
    :returns: on the visible paths, (error_code, diagnostics); in particular
        ERR_DIRECTIOFILE when the file will be read directly by athena.
    """
    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    token = pdict.get('token', None)
    jobId = pdict.get('jobId', '')
    workDir = pdict.get('workDir', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)

    # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
    useCT = pdict.get('usect', True)
    prodDBlockToken = pdict.get('access', '')

    # get the DQ2 tracing report
    report = self.getStubTracingReport(pdict['report'], 'lcg', lfn, guid)

    # get a proper envsetup
    envsetup = self.getEnvsetup(get=True)

    ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
    if ec != 0:
        self.prepareReport('RFCP_FAIL', report)
        return ec, pilotErrorDiag

    # get the experiment object
    thisExperiment = getExperiment(experiment)

    if proxycheck:
        # do we have a valid proxy?
        s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup)
        if s != 0:
            self.prepareReport('PROXYFAIL', report)
            return s, pilotErrorDiag
    else:
        tolog("Proxy verification turned off")

    getfile = gpfn

    if path == '': path = './'
    fullname = os.path.join(path, lfn)

    # should the root file be copied or read directly by athena?
    directIn, useFileStager = self.getTransferModes()
    if directIn:
        if useCT:
            # copy tool forced: direct access disabled
            directIn = False
            tolog("Direct access mode is switched off (file will be transferred with the copy tool)")
            updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", type="input")
        else:
            # determine if the file is a root file according to its name
            rootFile = self.isRootFileName(lfn)

            if prodDBlockToken == 'local' or not rootFile:
                directIn = False
                tolog("Direct access mode has been switched off for this file (will be transferred with the copy tool)")
                updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", type="input")
            elif rootFile:
                # root file: no transfer, athena reads it remotely
                tolog("Found root file according to file name: %s (will not be transferred in direct reading mode)" % (lfn))
                report['relativeStart'] = None
                report['transferStart'] = None
                self.prepareReport('FOUND_ROOT', report)
                if useFileStager:
                    updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="file_stager", type="input")
                else:
                    updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="remote_io", type="input")
                return error.ERR_DIRECTIOFILE, pilotErrorDiag
            else:
                tolog("Normal file transfer")

    # get remote filesize and checksum
    if fsize == 0 or fchecksum == 0:
        try:
            import lfc
        except Exception, e:
            pilotErrorDiag = "get_data() could not import lfc module: %s" % str(e)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport('LFC_IMPORT', report)
            return error.ERR_GETLFCIMPORT, pilotErrorDiag

        os.environ['LFC_HOST'] = readpar('lfchost')

        try:
            ret, res = lfc.lfc_getreplicas([str(guid)],"")
        except Exception, e:
            pilotErrorDiag = "Failed to get LFC replicas: %s" % str(e)
            tolog("!!WARNING!!2990!! Exception caught: %s" % (pilotErrorDiag))
            tolog("Mover get_data finished (failed)")
            self.prepareReport('NO_LFC_REPS', report)
            return error.ERR_FAILEDLFCGETREPS, pilotErrorDiag
node['cpuConsumptionUnit'] = '?' node['cpuConversionFactor'] = job.cpuConversionFactor # report specific time measures # node['pilotTiming'] = "getJob=%s setup=%s stageIn=%s payload=%s stageOut=%s" % (job.timeGetJob, job.timeSetup, job.timeStageIn, job.timeExe, job.timeStageOut) node['pilotTiming'] = "%s|%s|%s|%s|%s" % (job.timeGetJob, job.timeStageIn, job.timeExe, job.timeStageOut, job.timeSetup) elif job.result[0] == 'holding': node['exeErrorCode'] = job.result[2] node['exeErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2]) else: node['cpuConsumptionUnit'] = getCPUmodel() # Add the utility info if it is available thisExperiment = getExperiment(job.experiment) if thisExperiment.shouldExecuteUtility(): utility_node = thisExperiment.getUtilityInfo(job.workdir, self.__pilot_initdir, allowTxtFile=True) node = merge_dictionaries(node, utility_node) return node def getXML(self, job, sitename, workdir, xmlstr=None, jr=False): """ Get the metadata xml """ node_xml = "" tolog("getXML called") # for backwards compatibility try: experiment = job.experiment
def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
    """
    Copy an output file from the local disk to the local SE (stage-out),
    optionally to an alternative SE using a cvmfs-based EMI setup.

    :param source: path of the local file to be staged out.
    :param destination: destination endpoint (the SURL used is derived via
        SiteInformation.getProperPaths).
    :param fsize: file size in bytes; 0 means "determine it locally".
    :param fchecksum: file checksum; 0/"" means "compute it locally" (adler32).
    :param pdict: transfer metadata (alt, lfn, guid, token, scope, dsname,
        analJob, experiment, proxycheck, prodSourceLabel, report, ...).
    :returns: on the visible failure paths, the tuple from put_data_retfail().
    """
    # function is based on dCacheSiteMover put function

    error = PilotErrors()
    pilotErrorDiag = ""

    # Get input parameters from pdict
    alt = pdict.get('alt', False)
    lfn = pdict.get('lfn', '')
    guid = pdict.get('guid', '')
    token = pdict.get('token', '')
    scope = pdict.get('scope', '')
    dsname = pdict.get('dsname', '')
    analysisJob = pdict.get('analJob', False)
    testLevel = pdict.get('testLevel', '0')
    extradirs = pdict.get('extradirs', '')
    experiment = pdict.get('experiment', '')
    proxycheck = pdict.get('proxycheck', False)
    prodSourceLabel = pdict.get('prodSourceLabel', '')

    # get the site information object
    si = getSiteInformation(experiment)

    # PanDA Mover ('ddm') jobs always use the production stage-out path
    if prodSourceLabel == 'ddm' and analysisJob:
        tolog("Treating PanDA Mover job as a production job during stage-out")
        analysisJob = False

    # get the DQ2 tracing report
    report = self.getStubTracingReport(pdict['report'], 'lcg2', lfn, guid)

    # preparing variables
    if fsize == 0 or fchecksum == 0:
        ec, pilotErrorDiag, fsize, fchecksum = self.getLocalFileInfo(source, csumtype="adler32")
        if ec != 0:
            self.prepareReport('LOCAL_FILE_INFO_FAIL', report)
            return self.put_data_retfail(ec, pilotErrorDiag)

    # now that the file size is known, add it to the tracing report
    report['filesize'] = fsize

    # get the checksum type
    if fchecksum != 0 and fchecksum != "":
        csumtype = self.getChecksumType(fchecksum)
    else:
        csumtype = "default"

    # get a proper envsetup
    if alt:
        # use a cvmfs setup for stage-out to alternative SE
        envsetup = si.getLocalEMISetup()
        if envsetup[-1] != ";":
            envsetup += "; "
    else:
        envsetup = self.getEnvsetup(alt=alt)

    ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
    if ec != 0:
        self.prepareReport('RFCP_FAIL', report)
        return self.put_data_retfail(ec, pilotErrorDiag)

    # get the experiment object
    thisExperiment = getExperiment(experiment)

    if proxycheck:
        s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup, limit=2)
        if s != 0:
            self.prepareReport('NO_PROXY', report)
            return self.put_data_retfail(error.ERR_NOPROXY, pilotErrorDiag)
    else:
        tolog("Proxy verification turned off")

    filename = os.path.basename(source)

    # get all the proper paths
    ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(error, analysisJob, token, prodSourceLabel, dsname, filename, scope=scope, alt=alt)
    if ec != 0:
        self.prepareReport(tracer_error, report)
        return self.put_data_retfail(ec, pilotErrorDiag, surl=dst_gpfn)

    putfile = surl
    full_surl = putfile
    if full_surl[:len('token:')] == 'token:':
        # remove the space token (e.g. at Taiwan-LCG2) from the SURL info
        full_surl = full_surl[full_surl.index('srm://'):]

    # srm://dcache01.tier2.hep.manchester.ac.uk/pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/
    # testpanda.destDB/testpanda.destDB.604b4fbc-dbe9-4b05-96bb-6beee0b99dee_sub0974647/
    # 86ecb30d-7baa-49a8-9128-107cbfe4dd90_0.job.log.tgz
    tolog("putfile = %s" % (putfile))
    tolog("full_surl = %s" % (full_surl))

    # get the DQ2 site name from ToA
    # best-effort only: a failure is logged but does not abort the stage-out
    try:
        _dq2SiteName = self.getDQ2SiteName(surl=putfile)
    except Exception, e:
        tolog("Warning: Failed to get the DQ2 site name: %s (can not add this info to tracing report)" % str(e))
def put_data(self, pfn, ddm_storage, fsize=0, fchecksum=0, dsname="", extradirs="", **pdict):
    """
    Copy an output file to the local (CASTOR) SE and prepare the matching
    LFC registration paths.

    :param pfn: local physical file name to be staged out.
    :param ddm_storage: DDM storage endpoint (only logged in the visible part;
        the destination is derived from seprodpath/sepath schedconfig values).
    :param fsize: file size (unused in the visible part).
    :param fchecksum: file checksum (unused in the visible part).
    :param dsname: dataset name; required, and must match the
        "<field1>.<field2>..." format used to build the destination subdir.
    :param extradirs: extra directory components (unused in the visible part).
    :param pdict: transfer metadata (lfn, guid, analJob, experiment, report, ...).
    :returns: on the visible failure paths, the tuple from put_data_retfail().
    """
    error = PilotErrors()
    pilotErrorDiag = ""

    tolog("put_data() got ddm_storage=%s" % (ddm_storage))

    # Get input parameters from pdict
    lfn = pdict.get("lfn", "")
    guid = pdict.get("guid", "")
    analJob = pdict.get("analJob", False)
    experiment = pdict.get("experiment", "")

    # get the Rucio tracing report
    report = self.getStubTracingReport(pdict["report"], "castor", lfn, guid)

    # get a proper envsetup
    envsetup = self.getEnvsetup()

    ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
    if ec != 0:
        self.prepareReport("RFCP_FAIL", report)
        return self.put_data_retfail(ec, pilotErrorDiag)

    # get the experiment object
    thisExperiment = getExperiment(experiment)

    # do we have a valid proxy?
    s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup, limit=2)
    if s != 0:
        self.prepareReport("PROXYFAIL", report)
        return self.put_data_retfail(s, pilotErrorDiag)

    filename = pfn.split("/")[-1]

    # the current file
    report["filename"] = lfn

    # guid
    report["guid"] = guid.replace("-", "")

    # Destination is the top level Castor store area. Append a subdirectory which is first two fields of dsname, or 'other'
    destination = ""
    if not analJob:
        # seprodpath can have a complex structure in case of space tokens
        # although currently not supported in this site mover, prepare the code anyway
        # (use the first list item only)
        destination = self.getDirList(readpar("seprodpath"))[0]
        if destination == "":
            tolog("!!WARNING!!2999!! seprodpath not defined, using sepath")
            destination = readpar("sepath")
        tolog("Going to store production job output")
    else:
        destination = readpar("sepath")
        tolog("Going to store analysis job output")

    if destination == "":
        pilotErrorDiag = "put_data destination path in SE not defined"
        tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
        self.prepareReport("DEST_PATH_UNDEF", report)
        return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
    else:
        tolog("destination: %s" % (destination))

    if dsname == "":
        pilotErrorDiag = "Dataset name not specified to put_data"
        tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
        self.prepareReport("NO_DSN", report)
        return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
    # else:
    #     dsname = self.remove_sub(dsname)
    #     tolog("dsname: %s" % (dsname))

    lfcpath, pilotErrorDiag = self.getLFCPath(analJob)
    if lfcpath == "":
        self.prepareReport("LFC_PATH_FAIL", report)
        return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)

    tolog("LFC path: %s" % (lfcpath))

    # take the first two dot-separated fields of the dataset name as subdir
    pat = re.compile("([^\.]+\.[^\.]+)\..*")
    mat = pat.match(dsname)
    if mat:
        prefixdir = mat.group(1)
        castor_destination = os.path.join(destination, prefixdir)
    else:
        pilotErrorDiag = "Unexpected dataset name format: %s" % (dsname)
        tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
        self.prepareReport("DSN_FORMAT_FAIL", report)
        return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
    tolog("SE destination: %s" % (castor_destination))

    # set up paths differently for analysis and production jobs
    # use conventional LFC paths or production jobs
    # use OSG style for analysis jobs (for the time being)
    if not analJob:
        # return full lfc file path (beginning lfcpath might need to be replaced)
        native_lfc_path = self.to_native_lfn(dsname, filename)
        # /grid/atlas/dq2/testpanda/testpanda.destDB.b7cd4b56-1b5e-465a-a5d7-38d5e2609724_sub01000457/
        # 58f836d5-ff4b-441a-979b-c37094257b72_0.job.log.tgz
        tolog("Native_lfc_path: %s" % (native_lfc_path))

        # replace the default path /grid/atlas/rucio with lfcpath if different
        # (to_native_lfn returns a path begining with /grid/atlas/rucio)
        default_lfcpath = "/grid/atlas/rucio" # to_native_lfn always returns this at the beginning of the string
        if default_lfcpath != lfcpath:
            final_lfc_path = native_lfc_path.replace(default_lfcpath, lfcpath)
        else:
            final_lfc_path = native_lfc_path

        # name of dir to be created in LFC
        lfcdir = os.path.dirname(final_lfc_path)
        # /grid/atlas/dq2/testpanda/testpanda.destDB.dfb45803-1251-43bb-8e7a-6ad2b6f205be_sub01000492
        tolog("LFC dir: %s" % (lfcdir))

        # dst_gpfn = destination
        # dst_gpfn = os.path.join(destination, os.path.join(dsname, filename))
        # /pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/testpanda/testpanda.destDB.dfb45803-1251-43bb-8e7a-6ad2b6f205be_sub01000492
        # tolog("dst_gpfn: %s" % (dst_gpfn))
    else:
        # for analysis jobs
        lfcdir = "%s/%s/%s" % (lfcpath, prefixdir, dsname)
        tolog("lfcdir: %s" % (lfcdir))

    report["relativeStart"] = time()

    # name of dir to be created on Castor
    dirname = os.path.join(castor_destination, dsname)

    dst_gpfn = os.path.join(castor_destination, os.path.join(dsname, filename))
    tolog("dst_gpfn: %s" % (dst_gpfn))
    fppfn = os.path.abspath(pfn)

    # get the RSE from ToA
    # best-effort only: a failure is logged but does not abort the stage-out
    try:
        _RSE = self.getRSE(surl=dst_gpfn)
    except Exception, e:
        tolog("Warning: Failed to get RSE: %s (can not add this info to tracing report)" % str(e))
def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
    """
    Stage in one input file from CASTOR with rfcp and verify the local copy.

    :param gpfn: remote physical file name; must contain a /castor/ path.
    :param lfn: logical file name; the local copy is written as path/lfn.
    :param path: local destination directory.
    :param fsize: catalog file size (0 if unknown; skips size check).
    :param fchecksum: catalog checksum (0 if unknown; skips checksum check).
    :param guid: file GUID (used for the tracing report).
    :param pdict: transfer metadata (usect, jobId, workDir, experiment,
        access, report).
    :returns: (0, diagnostics) on success, (error code, diagnostics) otherwise;
        ERR_DIRECTIOFILE signals that athena will read the file directly.
    """
    error = PilotErrors()
    pilotErrorDiag = ""

    # transfer metadata handed over by the caller
    useCT = pdict.get("usect", True)
    jobId = pdict.get("jobId", "")
    workDir = pdict.get("workDir", "")
    experiment = pdict.get("experiment", "")
    prodDBlockToken = pdict.get("access", "")

    # tracing report stub for this transfer
    report = self.getStubTracingReport(pdict["report"], "castor", lfn, guid)

    # environment setup string used to run the copy command
    envsetup = self.getEnvsetup(get=True)
    ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
    if ec != 0:
        self.prepareReport("RFCP_FAIL", report)
        return ec, pilotErrorDiag

    thisExperiment = getExperiment(experiment)

    # a valid proxy is required before any transfer is attempted
    s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup)
    if s != 0:
        self.prepareReport("PROXYFAIL", report)
        return s, pilotErrorDiag

    # strip off any prefix so rfcp can be given a bare /castor/... path
    tolog("gpfn: %s" % (gpfn))
    castor_match = re.compile("^.*(/castor/.*)$").match(gpfn)
    if not castor_match:
        pilotErrorDiag = "Get file not in castor: %s" % (gpfn)
        tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
        self.prepareReport("NO_FILE", report)
        return error.ERR_STAGEINFAILED, pilotErrorDiag
    getfile = castor_match.group(1)

    # the copied file is renamed to the lfn (removes the legacy __DQ2-string on some files)
    dest_path = os.path.join(path, lfn)

    def _remove_staged_file():
        # delete the local copy so that a later get retry starts clean
        if not self.removeLocal(dest_path):
            tolog("!!WARNING!!1112!! Failed to remove local file, get retry will fail")

    # decide whether athena should read the file directly instead of copying it
    directIn, useFileStager = self.getTransferModes()
    if directIn:
        if useCT:
            # copy tool usage forced: direct access disabled
            directIn = False
            tolog("Direct access mode is switched off (file will be transferred with the copy tool)")
            updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", ftype="input")
        else:
            # only root files (by name) are candidates for direct reading
            rootFile = self.isRootFileName(lfn)
            if prodDBlockToken == "local" or not rootFile:
                directIn = False
                tolog("Direct access mode has been switched off for this file (will be transferred with the copy tool)")
                updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", ftype="input")
            elif rootFile:
                tolog("Found root file according to file name: %s (will not be transferred in direct reading mode)" % (lfn))
                report["relativeStart"] = None
                report["transferStart"] = None
                self.prepareReport("FOUND_ROOT", report)
                new_state = "file_stager" if useFileStager else "remote_io"
                updateFileState(lfn, workDir, jobId, mode="transfer_mode", state=new_state, ftype="input")
                return error.ERR_DIRECTIOFILE, pilotErrorDiag
            else:
                tolog("Normal file transfer")

    # stage the input file in with rfcp
    copy_cmd = "%srfcp %s %s" % (envsetup, getfile, dest_path)
    tolog("Executing command: %s" % (copy_cmd))
    report["transferStart"] = time()
    rc, out = commands.getstatusoutput(copy_cmd)
    report["validateStart"] = time()
    if rc != 0:
        out = out.replace("\n", " ")
        check_syserr(rc, out)

        _remove_staged_file()

        # classify the rfcp failure
        if "No such file or directory" in out:
            if "DBRelease" in getfile:
                pilotErrorDiag = "Missing DBRelease file: %s" % (getfile)
                ec = error.ERR_MISSDBREL
            else:
                pilotErrorDiag = "No such file or directory: %s" % (getfile)
                ec = error.ERR_NOSUCHFILE
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
        else:
            pilotErrorDiag = "rfcp failed: %d, %s" % (rc, out)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            ec = error.ERR_STAGEINFAILED

        self.prepareReport("RFCP_FAIL", report)
        return ec, pilotErrorDiag

    # verify the local copy against the catalog values, when known
    if fsize != 0 or fchecksum != 0:
        # derive the checksum type from the remote checksum value
        if fchecksum != 0 and fchecksum != "":
            csumtype = self.getChecksumType(fchecksum)
        else:
            csumtype = "default"

        # measure size and checksum of the file that was just copied
        ec, pilotErrorDiag, dstfsize, dstfchecksum = self.getLocalFileInfo(dest_path, csumtype=csumtype)
        tolog("File info: %d, %s, %s" % (ec, dstfsize, dstfchecksum))
        if ec != 0:
            self.prepareReport("LOCAL_FILE_INFO_FAIL", report)
            _remove_staged_file()
            return ec, pilotErrorDiag

        # size comparison
        if fsize != 0 and dstfsize != fsize:
            pilotErrorDiag = "Remote and local file sizes do not match for %s (%s != %s)" % (os.path.basename(gpfn), str(dstfsize), str(fsize))
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport("FS_MISMATCH", report)
            _remove_staged_file()
            return error.ERR_GETWRONGSIZE, pilotErrorDiag

        # checksum comparison (skipped for dummy checksums)
        if fchecksum != 0 and dstfchecksum != fchecksum and not self.isDummyChecksum(fchecksum):
            pilotErrorDiag = "Remote and local checksums (of type %s) do not match for %s (%s != %s)" % (csumtype, os.path.basename(gpfn), dstfchecksum, fchecksum)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            _remove_staged_file()
            if csumtype == "adler32":
                self.prepareReport("AD_MISMATCH", report)
                return error.ERR_GETADMISMATCH, pilotErrorDiag
            self.prepareReport("MD5_MISMATCH", report)
            return error.ERR_GETMD5MISMATCH, pilotErrorDiag

    # transfer and verification succeeded
    updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", ftype="input")
    self.prepareReport("DONE", report)
    return 0, pilotErrorDiag
class curlSiteMover(SiteMover.SiteMover):
    """ SiteMover that transfers files with the curl command """

    copyCommand = "curl"
    checksum_command = "adler32"
    has_mkdir = False
    has_df = False
    has_getsize = False
    has_md5sum = True
    has_chmod = False
    timeout = 3600  # transfer timeout in seconds, fed to curl's --max-time

    """ get proxy """
    # Grid SSL credentials used by every curl transfer.
    # NOTE(review): these are class-level attributes, evaluated once at import
    # time — a proxy renewed later will not be picked up here; confirm intended.
    si = SiteInformation()
    sslCert = si.getSSLCertificate()
    sslKey = sslCert  # proxy file serves as both certificate and key
    sslCertDir = si.getSSLCertificatesDirectory()

    def __init__(self, setup_path, *args, **kwrds):
        # setup_path: shell snippet prepended to transfer commands
        self._setup = setup_path

    def get_timeout(self):
        # Return the transfer timeout in seconds.
        return self.timeout

    def check_space(self, ub):
        """ For when space availability is not verifiable """
        # curl cannot query free space on the SE, so report a huge dummy value
        return 999999

    def core_get_data(self, envsetup, token, source_surl, dest_path, experiment):
        """ stage-in core function, can be overridden (see stormSiteMover)

        Asks lcg-gt for an https transport URL for the SURL; if that fails,
        falls back to rewriting the srm:// prefix. Then downloads the file
        with curl (authenticated with the grid proxy) and records the command
        in the job setup script.
        """

        error = PilotErrors()

        # determine which timeout option to use
        timeout_option = "--connect-timeout 300 --max-time %d" % (self.timeout)

        sslCert = self.sslCert
        sslKey = self.sslKey
        sslCertDir = self.sslCertDir

        # used curl options:
        # --cert: <cert[:passwd]> Client certificate file and password (SSL)
        # --capath: <directory> CA directory (made using c_rehash) to verify
        # --location: Follow Location: hints (H)
        # --output: <file> Write output to <file> instead of stdout
        # --cilent: Makes Curl mute
        # --show-error: When used with -s it makes curl show error message if it fails
        # Removed for SL6: --ciphers <list of ciphers> (SSL) Specifies which ciphers to use in the connection.

        """ define curl command string """
        # probe whether the SE can translate the SURL to an https URL
        _cmd_str = 'lcg-gt %s https' % (source_surl)
        try:
            s, o = commands.getstatusoutput(_cmd_str)
            tolog("Executing command: %s" % (_cmd_str))
        except Exception, e:
            # NOTE(review): s and o are unbound here if getstatusoutput itself
            # raised before assigning them — the tolog below would then NameError
            tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" % (str(e), s, o))
            o = str(e)

        if s == 0:
            tolog("lcg-gt supported, get http path")
            # first whitespace-separated token of the lcg-gt output is the URL
            source_surl = o.strip().split()
            source_surl = source_surl[0]
            _cmd_str = '%s curl --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey, source_surl, dest_path)
            # _cmd_str = '%s curl --ciphers ALL:NULL --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey, source_surl, dest_path)
        else:
            tolog("lcg-gt not supported, get http path by replacing source_surl")
            _cmd_str = '%s curl --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey, source_surl, dest_path)
            # _cmd_str = '%s curl --ciphers ALL:NULL --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey, source_surl, dest_path)
            # fall back to a plain protocol rewrite of the SURL
            _cmd_str = _cmd_str.replace("srm://", "https://")

        # add the full stage-out command to the job setup script
        #_cmd_str = _cmd_str.replace("file://", "-o ")

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        to_script = _cmd_str
        to_script = to_script.lstrip(' ')  # remove any initial spaces
        if to_script.startswith('/'):
            to_script = 'source ' + to_script
        thisExperiment.updateJobSetupScript(os.path.dirname(dest_path), to_script=to_script)

        tolog("Executing command: %s" % (_cmd_str))
        s = -1
        o = '(not defined)'
        t0 = os.times()
        try:
            s, o = commands.getstatusoutput(_cmd_str)
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" % (str(e), s, o))
            o = str(e)
    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """ copy output file from disk to local SE

        :param source: local path of the file to stage out
        :param destination: destination SE endpoint
        :param fsize: known file size (0 -> compute locally)
        :param fchecksum: known checksum (0 -> compute locally)
        :param pdict: extra parameters (lfn, guid, token, scope, dsname, ...)
        NOTE(review): this view of the function is truncated after the
        getDQ2SiteName lookup — the transfer itself happens further down.
        """
        # function is based on dCacheSiteMover put function

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        alt = pdict.get('alt', False)
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        dsname = pdict.get('dsname', '')
        analysisJob = pdict.get('analJob', False)
        testLevel = pdict.get('testLevel', '0')
        extradirs = pdict.get('extradirs', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)
        prodSourceLabel = pdict.get('prodSourceLabel', '')

        # get the site information object
        si = getSiteInformation(experiment)

        # PanDA Mover transfers are handled as production stage-outs
        if prodSourceLabel == 'ddm' and analysisJob:
            tolog("Treating PanDA Mover job as a production job during stage-out")
            analysisJob = False

        # get the DQ2 tracing report
        report = self.getStubTracingReport(pdict['report'], 'lcg2', lfn, guid)

        # preparing variables
        if fsize == 0 or fchecksum == 0:
            ec, pilotErrorDiag, fsize, fchecksum = self.getLocalFileInfo(source, csumtype="adler32")
            if ec != 0:
                self.__sendReport('LOCAL_FILE_INFO_FAIL', report)
                return self.put_data_retfail(ec, pilotErrorDiag)

        # now that the file size is known, add it to the tracing report
        report['filesize'] = fsize

        # get the checksum type
        if fchecksum != 0 and fchecksum != "":
            csumtype = self.getChecksumType(fchecksum)
        else:
            csumtype = "default"

        # get a proper envsetup
        if alt:
            # use a cvmfs setup for stage-out to alternative SE
            envsetup = si.getLocalEMISetup()
            if envsetup[-1] != ";":
                envsetup += "; "
        else:
            envsetup = self.getEnvsetup(alt=alt)

        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.__sendReport('RFCP_FAIL', report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        if proxycheck:
            s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup, limit=2)
            if s != 0:
                self.__sendReport('NO_PROXY', report)
                return self.put_data_retfail(error.ERR_NOPROXY, pilotErrorDiag)
        else:
            tolog("Proxy verification turned off")

        filename = os.path.basename(source)

        # get all the proper paths
        ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(error, analysisJob, token, prodSourceLabel, dsname, filename, scope=scope, alt=alt)
        if ec != 0:
            self.__sendReport(tracer_error, report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        putfile = surl
        full_surl = putfile
        if full_surl[:len('token:')] == 'token:':
            # remove the space token (e.g. at Taiwan-LCG2) from the SURL info
            full_surl = full_surl[full_surl.index('srm://'):]

        # srm://dcache01.tier2.hep.manchester.ac.uk/pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/
        #testpanda.destDB/testpanda.destDB.604b4fbc-dbe9-4b05-96bb-6beee0b99dee_sub0974647/
        #86ecb30d-7baa-49a8-9128-107cbfe4dd90_0.job.log.tgz
        tolog("putfile = %s" % (putfile))
        tolog("full_surl = %s" % (full_surl))

        # get the DQ2 site name from ToA (best-effort, only used for tracing)
        try:
            _dq2SiteName = self.getDQ2SiteName(surl=putfile)
        except Exception, e:
            tolog("Warning: Failed to get the DQ2 site name: %s (can not add this info to tracing report)" % str(e))
    def put_data(self, pfn, ddm_storage, fsize=0, fchecksum=0, dsname='', extradirs='', **pdict):
        """ Copy all output file to the local SE

        :param pfn: local physical file name of the output file
        :param ddm_storage: DDM storage endpoint (logged only in this view)
        :param fsize/fchecksum: known size/checksum (0 -> unknown)
        :param dsname: dataset name; its first two dot-fields choose the
                       Castor subdirectory
        NOTE(review): this view of the function is truncated after the
        getRSE lookup — the actual rfcp transfer happens further down.
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        tolog("put_data() got ddm_storage=%s" % (ddm_storage))

        # Get input parameters from pdict
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        analJob = pdict.get('analJob', False)
        experiment = pdict.get('experiment', '')

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 'castor', lfn, guid)

        # get a proper envsetup
        envsetup = self.getEnvsetup()

        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.prepareReport('RFCP_FAIL', report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        # do we have a valid proxy?
        s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup, limit=2)
        if s != 0:
            self.prepareReport('PROXYFAIL', report)
            return self.put_data_retfail(s, pilotErrorDiag)

        filename = pfn.split('/')[-1]

        # the current file
        report['filename'] = lfn

        # guid
        report['guid'] = guid.replace('-', '')

        # Destination is the top level Castor store area. Append a subdirectory which is first two fields of dsname, or 'other'
        destination = ""
        if not analJob:
            # seprodpath can have a complex structure in case of space tokens
            # although currently not supported in this site mover, prepare the code anyway
            # (use the first list item only)
            destination = self.getDirList(readpar('seprodpath'))[0]
            if destination == "":
                tolog("!!WARNING!!2999!! seprodpath not defined, using sepath")
                destination = readpar('sepath')
            tolog("Going to store production job output")
        else:
            destination = readpar('sepath')
            tolog("Going to store analysis job output")

        if destination == '':
            pilotErrorDiag = "put_data destination path in SE not defined"
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport('DEST_PATH_UNDEF', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
        else:
            tolog("destination: %s" % (destination))

        if dsname == '':
            pilotErrorDiag = "Dataset name not specified to put_data"
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport('NO_DSN', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
        # else:
        #     dsname = self.remove_sub(dsname)
        #     tolog("dsname: %s" % (dsname))

        lfcpath, pilotErrorDiag = self.getLFCPath(analJob)
        if lfcpath == "":
            self.prepareReport('LFC_PATH_FAIL', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)

        tolog("LFC path: %s" % (lfcpath))

        # dataset names are expected to look like "field1.field2.<rest>";
        # the first two fields become the prefix directory on Castor
        pat = re.compile('([^\.]+\.[^\.]+)\..*')
        mat = pat.match(dsname)
        if mat:
            prefixdir = mat.group(1)
            castor_destination = os.path.join(destination, prefixdir)
        else:
            pilotErrorDiag = "Unexpected dataset name format: %s" % (dsname)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport('DSN_FORMAT_FAIL', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
        tolog("SE destination: %s" % (castor_destination))

        # set up paths differently for analysis and production jobs
        # use conventional LFC paths or production jobs
        # use OSG style for analysis jobs (for the time being)
        if not analJob:
            # return full lfc file path (beginning lfcpath might need to be replaced)
            native_lfc_path = self.to_native_lfn(dsname, filename)

            # /grid/atlas/dq2/testpanda/testpanda.destDB.b7cd4b56-1b5e-465a-a5d7-38d5e2609724_sub01000457/
            #58f836d5-ff4b-441a-979b-c37094257b72_0.job.log.tgz
            tolog("Native_lfc_path: %s" % (native_lfc_path))

            # replace the default path /grid/atlas/rucio with lfcpath if different
            # (to_native_lfn returns a path begining with /grid/atlas/rucio)
            default_lfcpath = '/grid/atlas/rucio'  # to_native_lfn always returns this at the beginning of the string
            if default_lfcpath != lfcpath:
                final_lfc_path = native_lfc_path.replace(default_lfcpath, lfcpath)
            else:
                final_lfc_path = native_lfc_path

            # name of dir to be created in LFC
            lfcdir = os.path.dirname(final_lfc_path)
            # /grid/atlas/dq2/testpanda/testpanda.destDB.dfb45803-1251-43bb-8e7a-6ad2b6f205be_sub01000492
            tolog("LFC dir: %s" % (lfcdir))

            # dst_gpfn = destination
            # dst_gpfn = os.path.join(destination, os.path.join(dsname, filename))
            # /pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/testpanda/testpanda.destDB.dfb45803-1251-43bb-8e7a-6ad2b6f205be_sub01000492
            # tolog("dst_gpfn: %s" % (dst_gpfn))
        else:
            # for analysis jobs
            lfcdir = '%s/%s/%s' % (lfcpath, prefixdir, dsname)
            tolog("lfcdir: %s" % (lfcdir))

        report['relativeStart'] = time()

        # name of dir to be created on Castor
        dirname = os.path.join(castor_destination, dsname)

        dst_gpfn = os.path.join(castor_destination, os.path.join(dsname, filename))
        tolog("dst_gpfn: %s" % (dst_gpfn))
        fppfn = os.path.abspath(pfn)

        # get the RSE from ToA (best-effort, only used for tracing)
        try:
            _RSE = self.getRSE(surl=dst_gpfn)
        except Exception, e:
            tolog("Warning: Failed to get RSE: %s (can not add this info to tracing report)" % str(e))
    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ stage-in function

        Copies a file from Castor to the local work area with rfcp, unless
        direct-access mode applies, then verifies size and checksum.

        :param gpfn: full source replica path (must contain '/castor/')
        :param lfn: logical file name; the local copy is renamed to this
        :param path: local destination directory
        :param fsize/fchecksum: expected size/checksum (0 -> skip verification)
        :return: (exit code, pilot error diagnostics); 0 on success
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        useCT = pdict.get('usect', True)
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')
        prodDBlockToken = pdict.get('access', '')

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 'castor', lfn, guid)

        # get a proper envsetup
        envsetup = self.getEnvsetup(get=True)

        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.prepareReport('RFCP_FAIL', report)
            return ec, pilotErrorDiag

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        # do we have a valid proxy?
        s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup)
        if s != 0:
            self.prepareReport('PROXYFAIL', report)
            return s, pilotErrorDiag

        # Strip off prefix in order to use rfcp directly
        tolog("gpfn: %s" % (gpfn))
        pat = re.compile('^.*(/castor/.*)$')
        mat = pat.match(gpfn)
        if mat:
            getfile = mat.group(1)
        else:
            pilotErrorDiag = "Get file not in castor: %s" % (gpfn)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport('NO_FILE', report)
            return error.ERR_STAGEINFAILED, pilotErrorDiag

        # when the file has been copied we will rename it to the lfn (to remove the legacy __DQ2-string on some files)
        dest_path = os.path.join(path, lfn)

        # should the root file be copied or read directly by athena?
        directIn, useFileStager = self.getTransferModes()
        if directIn:
            if useCT:
                # copy tool forced: direct access disabled
                directIn = False
                tolog("Direct access mode is switched off (file will be transferred with the copy tool)")
                updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", ftype="input")
            else:
                # determine if the file is a root file according to its name
                rootFile = self.isRootFileName(lfn)

                if prodDBlockToken == 'local' or not rootFile:
                    directIn = False
                    tolog("Direct access mode has been switched off for this file (will be transferred with the copy tool)")
                    updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", ftype="input")
                elif rootFile:
                    # root file stays on the SE and is read remotely; no transfer
                    tolog("Found root file according to file name: %s (will not be transferred in direct reading mode)" % (lfn))
                    report['relativeStart'] = None
                    report['transferStart'] = None
                    self.prepareReport('FOUND_ROOT', report)
                    if useFileStager:
                        updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="file_stager", ftype="input")
                    else:
                        updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="remote_io", ftype="input")
                    # deliberate non-zero "error" telling the caller to skip the copy
                    return error.ERR_DIRECTIOFILE, pilotErrorDiag
                else:
                    tolog("Normal file transfer")

        # transfer the input file with rfcp
        _cmd_str = '%srfcp %s %s' % (envsetup, getfile, dest_path)
        tolog("Executing command: %s" % (_cmd_str))
        report['transferStart'] = time()
        s, o = commands.getstatusoutput(_cmd_str)
        report['validateStart'] = time()
        if s != 0:
            o = o.replace('\n', ' ')
            check_syserr(s, o)

            # remove the local file before any get retry is attempted
            _status = self.removeLocal(dest_path)
            if not _status:
                tolog("!!WARNING!!1112!! Failed to remove local file, get retry will fail")

            # map the rfcp failure to a specific pilot error code
            if o.find("No such file or directory") >= 0:
                if getfile.find("DBRelease") >= 0:
                    pilotErrorDiag = "Missing DBRelease file: %s" % (getfile)
                    tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                    ec = error.ERR_MISSDBREL
                else:
                    pilotErrorDiag = "No such file or directory: %s" % (getfile)
                    tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                    ec = error.ERR_NOSUCHFILE
            else:
                pilotErrorDiag = "rfcp failed: %d, %s" % (s, o)
                tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                ec = error.ERR_STAGEINFAILED

            self.prepareReport('RFCP_FAIL', report)
            return ec, pilotErrorDiag

        # check file size and checksum
        if fsize != 0 or fchecksum != 0:
            # which checksum type are we using?
            if fchecksum != 0 and fchecksum != "":
                csumtype = self.getChecksumType(fchecksum)
            else:
                csumtype = "default"

            # get remote file size and checksum
            ec, pilotErrorDiag, dstfsize, dstfchecksum = self.getLocalFileInfo(dest_path, csumtype=csumtype)
            tolog("File info: %d, %s, %s" % (ec, dstfsize, dstfchecksum))
            if ec != 0:
                self.prepareReport('LOCAL_FILE_INFO_FAIL', report)

                # remove the local file before any get retry is attempted
                _status = self.removeLocal(dest_path)
                if not _status:
                    tolog("!!WARNING!!1112!! Failed to remove local file, get retry will fail")

                return ec, pilotErrorDiag

            # compare remote and local file size
            if fsize != 0 and dstfsize != fsize:
                pilotErrorDiag = "Remote and local file sizes do not match for %s (%s != %s)" %\
                                 (os.path.basename(gpfn), str(dstfsize), str(fsize))
                tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
                self.prepareReport('FS_MISMATCH', report)

                # remove the local file before any get retry is attempted
                _status = self.removeLocal(dest_path)
                if not _status:
                    tolog("!!WARNING!!1112!! Failed to remove local file, get retry will fail")

                return error.ERR_GETWRONGSIZE, pilotErrorDiag

            # compare remote and local file checksum
            if fchecksum != 0 and dstfchecksum != fchecksum and not self.isDummyChecksum(fchecksum):
                pilotErrorDiag = "Remote and local checksums (of type %s) do not match for %s (%s != %s)" %\
                                 (csumtype, os.path.basename(gpfn), dstfchecksum, fchecksum)
                tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))

                # remove the local file before any get retry is attempted
                _status = self.removeLocal(dest_path)
                if not _status:
                    tolog("!!WARNING!!1112!! Failed to remove local file, get retry will fail")

                if csumtype == "adler32":
                    self.prepareReport('AD_MISMATCH', report)
                    return error.ERR_GETADMISMATCH, pilotErrorDiag
                else:
                    self.prepareReport('MD5_MISMATCH', report)
                    return error.ERR_GETMD5MISMATCH, pilotErrorDiag

        updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", ftype="input")
        self.prepareReport('DONE', report)
        return 0, pilotErrorDiag
def getUtilityInfo(self, node, experiment, workdir): """ Add the utility info to the node structure if available """ # Get the experiment object and check if the special utility (e.g. a memory monitor) was used thisExperiment = getExperiment(experiment) if thisExperiment.shouldExecuteUtility(): # Try to get the memory monitor info from the workdir first path = os.path.join(workdir, thisExperiment.getUtilityJSONFilename()) init_path = os.path.join(self.__pilot_initdir, thisExperiment.getUtilityJSONFilename()) primary_location = False if not os.path.exists(path): tolog("File does not exist: %s" % (path)) if os.path.exists(init_path): path = init_path else: tolog("File does not exist either: %s" % (path)) path = "" primary_location = False else: primary_location = True if path != "": tolog("Reading memory monitoring info from: %s" % (path)) # If the file is the primary one (ie the one in the workdir and not the initdir, then also check the modification time) read_from_file = True if primary_location: # Get the modification time mod_time = None max_time = 120 try: file_modification_time = os.path.getmtime(path) current_time = int(time()) mod_time = current_time - file_modification_time tolog("File %s was modified %d seconds ago" % (mod_time)) except: tolog("!!WARNING!!2323!! Could not read the modification time of %s" % (path)) tolog("!!WARNING!!2324!! Will add -1 values for the memory info") node['maxRSS'] = -1 node['maxVMEM'] = -1 node['maxSWAP'] = -1 node['maxPSS'] = -1 node['avgRSS'] = -1 node['avgVMEM'] = -1 node['avgSWAP'] = -1 node['avgPSS'] = -1 read_from_file = False else: if mod_time > max_time: tolog("!!WARNING!!2325!! 
File %s was modified over %d s ago, will add -1 values for the memory info" % (path, max_time)) node['maxRSS'] = -1 node['maxVMEM'] = -1 node['maxSWAP'] = -1 node['maxPSS'] = -1 node['avgRSS'] = -1 node['avgVMEM'] = -1 node['avgSWAP'] = -1 node['avgPSS'] = -1 read_from_file = False if read_from_file: # Get the dictionary d = getJSONDictionary(path) if d and d != {}: try: # Move to experiment class? node['maxRSS'] = d['Max']['maxRSS'] node['maxVMEM'] = d['Max']['maxVMEM'] node['maxSWAP'] = d['Max']['maxSwap'] node['maxPSS'] = d['Max']['maxPSS'] node['avgRSS'] = d['Avg']['avgRSS'] node['avgVMEM'] = d['Avg']['avgVMEM'] node['avgSWAP'] = d['Avg']['avgSwap'] node['avgPSS'] = d['Avg']['avgPSS'] except Exception, e: tolog("!!WARNING!!54541! Exception caught while parsing memory monitor JSON: %s" % (e)) else: tolog("Extracted info from memory monitor JSON") # Done with the memory monitor for this job (if the file is read from the pilots' init dir), remove the file in case there are other jobs to be run if os.path.exists(init_path): try: os.system("rm -rf %s" % (init_path)) except Exception, e: tolog("!!WARNING!!4343!! Failed to remove %s: %s" % (init_path), e) else: tolog("Removed %s" % (init_path))
    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """ copy output file from disk to local SE

        :param source: local path of the file to stage out
        :param destination: destination SE endpoint
        :param fsize/fchecksum: known size/checksum (0 -> compute locally)
        :param pdict: extra parameters (lfn, guid, token, scope, dsname, ...)
        NOTE(review): this view of the function is truncated after the
        getRSE lookup — the curl upload itself happens further down.
        """
        # function is based on dCacheSiteMover put function

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        dsname = pdict.get('dsname', '')
        analysisJob = pdict.get('analJob', False)
        testLevel = pdict.get('testLevel', '0')
        extradirs = pdict.get('extradirs', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)
        prodSourceLabel = pdict.get('prodSourceLabel', '')

        # get the site information object
        si = getSiteInformation(experiment)

        tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
        # PanDA Mover transfers are handled as production stage-outs
        if prodSourceLabel == 'ddm' and analysisJob:
            tolog("Treating PanDA Mover job as a production job during stage-out")
            analysisJob = False

        # get the Rucio tracing report
        try:
            report = pdict['report']
        except:
            report = {}
        else:
            # set the proper protocol
            report['protocol'] = 'curl'
            # mark the relative start
            report['catStart'] = time()
            # the current file
            report['filename'] = lfn
            # guid
            report['guid'] = guid.replace('-', '')

        # preparing variables
        if fsize == 0 or fchecksum == 0:
            ec, pilotErrorDiag, fsize, fchecksum = self.getLocalFileInfo(source, csumtype="adler32")
            if ec != 0:
                self.prepareReport('LOCAL_FILE_INFO_FAIL', report)
                return self.put_data_retfail(ec, pilotErrorDiag)

        # now that the file size is known, add it to the tracing report
        report['filesize'] = fsize

        # get the checksum type
        if fchecksum != 0 and fchecksum != "":
            csumtype = self.getChecksumType(fchecksum)
        else:
            csumtype = "default"

        # get a proper envsetup
        envsetup = self.getEnvsetup()

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        if proxycheck:
            s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup, limit=2)
            if s != 0:
                self.prepareReport('NO_PROXY', report)
                return self.put_data_retfail(error.ERR_NOPROXY, pilotErrorDiag)
        else:
            tolog("Proxy verification turned off")

        filename = os.path.basename(source)

        # get all the proper paths
        ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(error, analysisJob, token, prodSourceLabel, dsname, filename, scope=scope, sitemover=self)  # quick workaround
        if ec != 0:
            self.prepareReport(tracer_error, report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        putfile = surl
        full_surl = putfile
        if full_surl[:len('token:')] == 'token:':
            # remove the space token (e.g. at Taiwan-LCG2) from the SURL info
            full_surl = full_surl[full_surl.index('srm://'):]

        # srm://dcache01.tier2.hep.manchester.ac.uk/pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/
        #testpanda.destDB/testpanda.destDB.604b4fbc-dbe9-4b05-96bb-6beee0b99dee_sub0974647/
        #86ecb30d-7baa-49a8-9128-107cbfe4dd90_0.job.log.tgz
        tolog("putfile: %s" % (putfile))
        tolog("full_surl: %s" % (full_surl))

        # get https surl
        full_http_surl = full_surl.replace("srm://", "https://")

        # get the RSE from ToA (best-effort, only used for tracing)
        try:
            _RSE = self.getRSE(surl=putfile)
        except Exception, e:
            tolog("Warning: Failed to get RSE: %s (can not add this info to tracing report)" % str(e))