def stageIn(job, jobSite, analJob, pilot_initdir, pworkdir):
    """ perform the stage-in """

    ec = 0
    statusPFCTurl = None
    usedFAXandDirectIO = False

    # prepare the input files (remove non-valid names) if there are any
    ins, job.filesizeIn, job.checksumIn = RunJobUtilities.prepareInFiles(job.inFiles, job.filesizeIn, job.checksumIn)
    if ins:
        tolog("Preparing for get command")

        # get the file access info (only useCT is needed here)
        useCT, oldPrefix, newPrefix, useFileStager, directIn = getFileAccessInfo()

        # transfer input files
        tin_0 = os.times()
        ec, job.pilotErrorDiag, statusPFCTurl, job.filesWithoutFAX, job.filesWithFAX, usedFAXandDirectIO = \
            mover.get_data(job, jobSite, ins, stageinretry, analysisJob=analJob, usect=useCT,
                           pinitdir=pilot_initdir, proxycheck=False, inputDir=inputDir, workDir=pworkdir)
        if ec != 0:
            job.result[2] = ec
        tin_1 = os.times()
        job.timeStageIn = int(round(tin_1[4] - tin_0[4]))

    return job, ins, statusPFCTurl, usedFAXandDirectIO
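# --- Illustrative sketch (not part of the original module) ---
# A minimal example of how a caller might consume stageIn()'s return values,
# assuming the surrounding pilot globals (pilotserver, pilotport) are in scope;
# the failJob() call mirrors the error handling used elsewhere in this file:
#
#     job, ins, statusPFCTurl, usedFAXandDirectIO = stageIn(job, jobSite, analJob, pilot_initdir, pworkdir)
#     if job.result[2] != 0:
#         # job.result[2] carries the stage-in exit code set via mover.get_data()
#         failJob(0, job.result[2], job, pilotserver, pilotport, ins=ins, pilotErrorDiag=job.pilotErrorDiag)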
def failJob(transExitCode, pilotExitCode, job, pilotserver, pilotport, ins=None, pilotErrorDiag=None, docleanup=True):
    """ set the fail code and exit """

    job.setState(["failed", transExitCode, pilotExitCode])
    if pilotErrorDiag:
        job.pilotErrorDiag = pilotErrorDiag
    tolog("Will now update local pilot TCP server")
    rt = RunJobUtilities.updatePilotServer(job, pilotserver, pilotport, final=True)
    if ins:
        ec = pUtil.removeFiles(job.workdir, ins)
    if docleanup:
        sysExit(job)
def stageInHPCEvent(self):
    tolog("Setting stage-in state until all input files have been copied")
    self.__job.jobState = "transferring"
    self.__job.setState([self.__job.jobState, 0, 0])
    rt = RunJobUtilities.updatePilotServer(self.__job, self.getPilotServer(), self.getPilotPort())
    self.__JR.updateJobStateTest(self.__job, self.__jobSite, self.__node, mode="test")

    # stage-in all input files (if necessary)
    job, ins, statusPFCTurl, usedFAXandDirectIO = self.stageIn(self.__job, self.__jobSite, self.__analysisJob, pfc_name="PFC.xml")
    if job.result[2] != 0:
        tolog("Failing job with ec: %d" % (job.result[2]))
        self.failJob(0, job.result[2], job, ins=ins, pilotErrorDiag=job.pilotErrorDiag)
    self.__job = job

    self.__job.displayJob()
def createFileMetadata(outFiles, job, outsDict, dsname, datasetDict, sitename):
    """ create the metadata for the output + log files """

    ec = 0

    # get/assign guids to the output files
    if outFiles:
        tolog("outFiles=%s" % str(outFiles))
        if not pUtil.isBuildJob(outFiles):
            ec, job.pilotErrorDiag, job.outFilesGuids = RunJobUtilities.getOutFilesGuids(job.outFiles, job.workdir)
            if ec:
                # missing PoolFileCatalog (only error code from getOutFilesGuids)
                return ec, job, None
        else:
            tolog("Build job - do not use PoolFileCatalog to get guid (generated)")
    else:
        tolog("This job has no output files")

    # get the file sizes and checksums for the local output files
    # WARNING: any errors occurring in getOutputFileInfo() are lost
    ec, pilotErrorDiag, fsize, checksum = pUtil.getOutputFileInfo(list(outFiles), getChecksumCommand(), skiplog=True, logFile=job.logFile)
    if ec != 0:
        tolog("!!FAILED!!2999!! %s" % (pilotErrorDiag))
        failJob(job.result[1], ec, job, pilotserver, pilotport, pilotErrorDiag=pilotErrorDiag)

    if logguid:
        guid = logguid
    else:
        guid = job.tarFileGuid

    # create preliminary metadata (no metadata yet about the log file - added later in pilot.py)
    _fname = "%s/metadata-%d.xml" % (job.workdir, job.jobId)
    try:
        _status = pUtil.PFCxml(job.experiment, _fname, list(job.outFiles), fguids=job.outFilesGuids, fntag="lfn",
                               alog=job.logFile, alogguid=guid, fsize=fsize, checksum=checksum, analJob=analJob)
    except Exception, e:
        pilotErrorDiag = "PFCxml failed due to problematic XML: %s" % (e)
        tolog("!!WARNING!!1113!! %s" % (pilotErrorDiag))
        failJob(job.result[1], error.ERR_MISSINGGUID, job, pilotserver, pilotport, pilotErrorDiag=pilotErrorDiag)
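# --- Illustrative sketch (not part of the original module) ---
# pUtil.PFCxml() writes a POOL File Catalog style metadata file. The exact
# attributes are up to that helper; the general shape is (all values below
# are hypothetical):
#
#   <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
#   <POOLFILECATALOG>
#     <File ID="6be9761f-0000-0000-0000-example-guid">
#       <logical>
#         <lfn name="EVNT.01234._000001.pool.root"/>
#       </logical>
#       <metadata att_name="fsize" att_value="123456"/>
#       <metadata att_name="adler32" att_value="0a1b2c3d"/>
#     </File>
#   </POOLFILECATALOG>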
def setup(job, jobSite, thisExperiment):
    """ prepare the setup and get the run command list """

    # start setup time counter
    t0 = time.time()
    ec = 0
    runCommandList = []

    # split up the job parameters to be able to loop over the tasks
    jobParameterList = job.jobPars.split("\n")
    jobHomePackageList = job.homePackage.split("\n")
    jobTrfList = job.trf.split("\n")
    jobAtlasRelease = getAtlasRelease(job.atlasRelease)

    tolog("Number of transformations to process: %s" % len(jobParameterList))
    if len(jobParameterList) > 1:
        multi_trf = True
    else:
        multi_trf = False

    # verify that the multi-trf job is set up properly
    ec, job.pilotErrorDiag, jobAtlasRelease = RunJobUtilities.verifyMultiTrf(jobParameterList, jobHomePackageList, jobTrfList, jobAtlasRelease)
    if ec > 0:
        return ec, runCommandList, job, multi_trf

    os.chdir(jobSite.workdir)
    tolog("Current job workdir is %s" % os.getcwd())

    # set up the trf(s)
    _i = 0
    _stdout = job.stdout
    _stderr = job.stderr
    _first = True
    for (_jobPars, _homepackage, _trf, _swRelease) in map(None, jobParameterList, jobHomePackageList, jobTrfList, jobAtlasRelease):
        tolog("Preparing setup %d/%d" % (_i + 1, len(jobParameterList)))

        # reset variables
        job.jobPars = _jobPars
        job.homePackage = _homepackage
        job.trf = _trf
        job.atlasRelease = _swRelease
        if multi_trf:
            job.stdout = _stdout.replace(".txt", "_%d.txt" % (_i + 1))
            job.stderr = _stderr.replace(".txt", "_%d.txt" % (_i + 1))

        # post process copysetup variable in case of directIn/useFileStager
        _copysetup = readpar('copysetup')
        _copysetupin = readpar('copysetupin')
        if "--directIn" in job.jobPars or "--useFileStager" in job.jobPars or _copysetup.count('^') == 5 or _copysetupin.count('^') == 5:
            # only need to update the queuedata file once
            if _first:
                RunJobUtilities.updateCopysetups(job.jobPars)
                _first = False

        # set up the trf
        ec, job.pilotErrorDiag, cmd, job.spsetup, job.JEM, job.cmtconfig = thisExperiment.getJobExecutionCommand(job, jobSite, pilot_initdir)
        if ec > 0:
            # setup failed
            break

        # add the setup command to the command list
        runCommandList.append(cmd)
        _i += 1

    job.stdout = _stdout
    job.stderr = _stderr
    job.timeSetup = int(time.time() - t0)
    tolog("Total setup time: %d s" % (job.timeSetup))

    return ec, runCommandList, job, multi_trf
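# --- Illustrative sketch (not part of the original module) ---
# A multi-trf job packs one entry per transformation into newline-separated
# strings; setup() splits them and iterates over them in lockstep. Values
# below are hypothetical:
#
#     job.jobPars     = "maxEvents=100 ...\nmaxEvents=100 ..."
#     job.homePackage = "AtlasProduction/17.2.11.2\nAtlasProduction/17.2.11.2"
#     job.trf         = "Evgen_trf.py\nAtlasG4_trf.py"
#
# map(None, a, b, c, d) is the Python 2 idiom for itertools.izip_longest():
# it zips the lists and pads the shorter ones with None.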
tolog("Warning: Could not copy metadata-%d.xml to site work dir - ddm Adder problems will occure in case of job recovery" % \ (job.jobId)) if job.result[0] == 'holding' and job.result[1] == 0: try: # create the data directory os.makedirs(job.datadir) except OSError, e: tolog("!!WARNING!!3000!! Could not create data directory: %s, %s" % (job.datadir, str(e))) else: # find all remaining files in case 'rf' is not empty remaining_files = [] moved_files_list = [] try: if rf != None: moved_files_list = RunJobUtilities.getFileNamesFromString(rf[1]) remaining_files = RunJobUtilities.getRemainingFiles(moved_files_list, job.outFiles) except Exception, e: tolog("!!WARNING!!3000!! Illegal return value from Mover: %s, %s" % (str(rf), str(e))) remaining_files = job.outFiles # move all remaining output files to the data directory nr_moved = 0 for _file in remaining_files: try: os.system("mv %s %s" % (_file, job.datadir)) except OSError, e: tolog("!!WARNING!!3000!! Failed to move file %s (abort all)" % (_file)) break else: nr_moved += 1
def executePayload(self, thisExperiment, job):

    t0 = os.times()
    res_tuple = None

    # loop over all run commands (only >1 for multi-trfs)
    getstatusoutput_was_interrupted = False
    job_status = None
    tolog("About to launch ARGO job")

    # poll MQ for job status
    try:
        # initiate the MQ interface and send the job
        self.argo_job.job_status_routing_key = '%s_job_status' % job.jobId

        si = SiteInformation()
        mi = MessageInterface()
        mi.host = 'atlasgridftp02.hep.anl.gov'
        mi.port = 5671

        proxy_cert_path = si.getSSLCertificate()
        mi.ssl_cert = os.path.dirname(proxy_cert_path) + "/rabbitmq-cert.pem"
        if 'X509_USER_CERT' in os.environ.keys():
            mi.ssl_cert = os.environ['X509_USER_CERT']

        mi.ssl_key = os.path.dirname(proxy_cert_path) + "/rabbitmq-key.pem"
        if 'X509_USER_KEY' in os.environ.keys():
            mi.ssl_key = os.environ['X509_USER_KEY']

        mi.ssl_ca_certs = '/grid/atlas/hpc/pilot_certs/cacerts.pem'
        ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
        if os.path.isfile(ca_certs):
            mi.ssl_ca_certs = ca_certs

        mi.exchange_name = 'argo_users'

        # create a queue to receive ARGO job status messages from the MQ
        tolog('Opening connection with MQ')
        mi.open_blocking_connection()
        tolog('Create queue [%s] to retrieve messages with job status' % self.argo_job.job_status_routing_key)
        mi.create_queue(self.argo_job.job_status_routing_key, self.argo_job.job_status_routing_key)

        # submit the ARGO job to the MQ
        routing_key = 'argo_job'
        if self.dev:
            routing_key = 'argo_job_dev'
        tolog('Sending msg with job to ARGO')
        mi.send_msg(self.argo_job.serialize(), routing_key)
        tolog('done sending')

        # wait until the job is done or has failed
        ARGO_err_msg = ''
        while True:
            time.sleep(5)
            message = mi.receive_msg(self.argo_job.job_status_routing_key, True)
            if message[2]:
                tolog("Got message from queue [%s]: method [%s], properties [%s], body [%s]" %
                      (self.argo_job.job_status_routing_key, message[0], message[1], message[2]))
                job_status = ArgoJobStatus.get_from_message(message[2])
                job.hpcStatus = job_status.state
                rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort())
                tolog("Extracted state: %s" % job_status.state)
                if job_status.state == job_status.HISTORY:
                    res_tuple = (0, "Done")
                    break
                elif job_status.is_failed():
                    res_tuple = (1, "Failed")
                    ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                elif job_status.state == job_status.FAILED:
                    res_tuple = (1, "Failed")
                    ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                    runJob.failJob(1, 0, job, ins=job.inFiles, pilotErrorDiag=ARGO_err_msg)
                    break
            time.sleep(5)

        mi.close()
        tolog('closing connection to MQ')
        tolog("Job State: %s" % (job_status.state))

    except Exception, e:
        tolog("!!FAILED!!3000!! Failed to run command %s" % str(e))
        getstatusoutput_was_interrupted = True
        res_tuple = (1, "Failed")
        self.failJob(0, self.__error.ERR_GENERALERROR, job, pilotErrorDiag=str(e))
class RunJobHpcEvent(RunJob):

    # private data members
    __runjob = "RunJobHpcEvent"             # String defining the sub class
    __instance = None                       # Boolean used by subclasses to become a Singleton
    #__error = PilotErrors()                # PilotErrors object

    # Required methods

    def __init__(self):
        """ Default initialization """
        # e.g. self.__errorLabel = errorLabel
        self.__output_es_files = []
        self.__eventRanges = {}
        self.__failedStageOuts = []
        self._hpcManager = None

    def __new__(cls, *args, **kwargs):
        """ Override the __new__ method to make the class a singleton """
        if not cls.__instance:
            cls.__instance = super(RunJob, cls).__new__(cls, *args, **kwargs)
        return cls.__instance

    def getRunJob(self):
        """ Return a string with the name of this RunJob subclass """
        return self.__runjob

    def getRunJobFileName(self):
        """ Return the filename of the module """
        return super(RunJobHpcEvent, self).getRunJobFileName()

    # def argumentParser(self):  <-- see example in RunJob.py

    def allowLoopingJobKiller(self):
        """ Should the pilot search for looping jobs? """
        # The pilot has the ability to monitor the payload work directory. If there are no updated files within a certain
        # time limit, the pilot will consider the job as stuck (looping) and will kill it. The looping time limits are set
        # in environment.py (see e.g. loopingLimitDefaultProd)
        return False

    def setupHPCEvent(self):
        self.__jobSite = Site.Site()
        self.__jobSite.setSiteInfo(self.argumentParser())
        ## For an HPC job, we don't need to reassign the workdir
        # reassign workdir for this job
        self.__jobSite.workdir = self.__jobSite.wntmpdir
        if not os.path.exists(self.__jobSite.workdir):
            os.makedirs(self.__jobSite.workdir)

        tolog("runJobHPCEvent.getPilotLogFilename=%s" % self.getPilotLogFilename())
        if self.getPilotLogFilename() != "":
            pUtil.setPilotlogFilename(self.getPilotLogFilename())

        # set node info
        self.__node = Node.Node()
        self.__node.setNodeName(os.uname()[1])
        self.__node.collectWNInfo(self.__jobSite.workdir)

        # redirect stderr
        #sys.stderr = open("%s/runJobHPCEvent.stderr" % (self.__jobSite.workdir), "w")

        tolog("Current job workdir is: %s" % os.getcwd())
        tolog("Site workdir is: %s" % self.__jobSite.workdir)

        # get the experiment object
        self.__thisExperiment = getExperiment(self.getExperiment())
        tolog("runEvent will serve experiment: %s" % (self.__thisExperiment.getExperiment()))

    def getHPCEventJobFromPanda(self):
        pass

    def getHPCEventJobFromEnv(self):
        tolog("getHPCEventJobFromEnv")
        try:
            # always use this filename as the new jobDef module name
            import newJobDef
            job = Job.Job()
            job.setJobDef(newJobDef.job)
            job.coreCount = 0
            job.workdir = self.__jobSite.workdir
            job.experiment = self.getExperiment()
            # figure out and set payload file names
            job.setPayloadName(self.__thisExperiment.getPayloadName(job))
            # reset the default job output file list which is anyway not correct
            job.outFiles = []
        except Exception, e:
            pilotErrorDiag = "Failed to process job info: %s" % str(e)
            tolog("!!WARNING!!3000!! %s" % (pilotErrorDiag))
            self.failJob(0, PilotErrors.ERR_UNKNOWN, job, pilotErrorDiag=pilotErrorDiag)

        self.__job = job

        # prepare the output file data directory
        # (will only be created for jobs that end up in a 'holding' state)
        self.__job.datadir = self.getParentWorkDir() + "/PandaJob_%s_data" % (job.jobId)

        # see if it's an analysis job or not
        trf = self.__job.trf
        self.__analysisJob = isAnalysisJob(trf.split(",")[0])

        # Setup starts here ................................................................................
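# --- Illustrative sketch (not part of the original module) ---
# The __new__ override above makes the class a process-wide singleton:
#
#     a = RunJobHpcEvent()
#     b = RunJobHpcEvent()
#     assert a is b   # both names refer to the same instance
#
# Note that Python still calls __init__ on every instantiation, so the member
# lists above are re-initialized each time the singleton is "constructed".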
        # Update the job state file
        self.__job.jobState = "starting"
        self.__job.setHpcStatus('init')

        # Send [especially] the process group back to the pilot
        self.__job.setState([self.__job.jobState, 0, 0])
        self.__job.jobState = self.__job.result
        rt = RunJobUtilities.updatePilotServer(self.__job, self.getPilotServer(), self.getPilotPort())

        self.__JR = JobRecovery(pshttpurl='https://pandaserver.cern.ch', pilot_initdir=self.__job.workdir)
        self.__JR.updateJobStateTest(self.__job, self.__jobSite, self.__node, mode="test")
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node, 25443)

        # prepare the setup and get the run command list
        ec, runCommandList, job, multi_trf = self.setup(self.__job, self.__jobSite, self.__thisExperiment)
        if ec != 0:
            tolog("!!WARNING!!2999!! runJob setup failed: %s" % (job.pilotErrorDiag))
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
        tolog("Setup has finished successfully")

        self.__job = job
        self.__runCommandList = runCommandList
        self.__multi_trf = multi_trf

        # job has been updated, display it again
        self.__job.displayJob()
        tolog("RunCommandList: %s" % self.__runCommandList)
        tolog("Multi_trf: %s" % self.__multi_trf)
def finishJob(self):
    try:
        self.__hpcManager.finishJob()
    except:
        tolog(sys.exc_info()[1])
        tolog(sys.exc_info()[2])

    # if the payload left the input files, delete them explicitly
    if self.__job.inFiles:
        ec = pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)
    #if self.__output_es_files:
    #    ec = pUtil.removeFiles("/", self.__output_es_files)

    errorCode = PilotErrors.ERR_UNKNOWN
    if self.__job.attemptNr < 4:
        errorCode = PilotErrors.ERR_ESRECOVERABLE

    #check HPC job status
    #if self.__hpcStatus:
    #    self.failJob(0, 1220, self.__job, pilotErrorDiag="HPC job failed")

    if len(self.__eventRanges) == 0:
        tolog("Cannot get event ranges")
        self.failJob(0, errorCode, self.__job, pilotErrorDiag="Cannot get event ranges")

    # check whether all event ranges are handled
    tolog("Total event ranges: %s" % len(self.__eventRanges))
    not_handled_events = self.__eventRanges.values().count('new')
    tolog("Not handled events: %s" % not_handled_events)
    done_events = self.__eventRanges.values().count('Done')
    tolog("Finished events: %s" % done_events)
    stagedOut_events = self.__eventRanges.values().count('stagedOut')
    tolog("stagedOut but not updated to panda server events: %s" % stagedOut_events)
    if done_events + stagedOut_events:
        errorCode = PilotErrors.ERR_ESRECOVERABLE
    if not_handled_events + stagedOut_events:
        tolog("Not all event ranges are handled; failing the job")
        self.failJob(0, errorCode, self.__job, pilotErrorDiag="Not all events are handled (total: %s, left: %s)" %
                     (len(self.__eventRanges), not_handled_events + stagedOut_events))

    dsname, datasetDict = self.getDatasets()
    tolog("dsname = %s" % (dsname))
    tolog("datasetDict = %s" % (datasetDict))

    # Create the output file dictionary needed for generating the metadata
    ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(self.__job.outFiles, self.__job.logFile, self.__job.workdir, fullpath=True)
    if ec:
        # missing output file (only error code from prepareOutFiles)
        self.failJob(self.__job.result[1], ec, self.__job, pilotErrorDiag=pilotErrorDiag)
    tolog("outsDict: %s" % str(outsDict))

    # Create metadata for all successfully staged-out output files (include the log file as well, even if it has not been created yet)
    ec, job, outputFileInfo = self.createFileMetadata([], self.__job, outsDict, dsname, datasetDict, self.__jobSite.sitename)
    if ec:
        self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

    # Rename the metadata produced by the payload
    #if not pUtil.isBuildJob(outs):
    self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

    # Check the job report for any exit code that should replace the res_tuple[0]
    res0, exitAcronym, exitMsg = self.getTrfExitInfo(0, self.__job.workdir)
    res = (res0, exitMsg, exitMsg)

    # Payload error handling
    ed = ErrorDiagnosis()
    job = ed.interpretPayload(self.__job, res, False, 0, self.__runCommandList, self.getFailureCode())
    if job.result[1] != 0 or job.result[2] != 0:
        self.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)
    self.__job = job

    job.jobState = "finished"
    job.setState([job.jobState, 0, 0])
    job.jobState = job.result
    rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort(), final=True)

    tolog("Done")
    self.sysExit(self.__job)
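# --- Illustrative sketch (not part of the original module) ---
# self.__eventRanges maps eventRangeID -> state string, so the bookkeeping in
# finishJob() is simple value counting. Hypothetical values:
#
#     eventRanges = {'range-1': 'Done', 'range-2': 'stagedOut', 'range-3': 'new'}
#     eventRanges.values().count('Done')       # -> 1  (finished and reported)
#     eventRanges.values().count('stagedOut')  # -> 1  (copied out, not yet reported)
#     eventRanges.values().count('new')        # -> 1  (never handled)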
def runHPCEvent(self):
    tolog("runHPCEvent")
    self.__job.jobState = "running"
    self.__job.setState([self.__job.jobState, 0, 0])
    self.__job.pilotErrorDiag = None
    rt = RunJobUtilities.updatePilotServer(self.__job, self.getPilotServer(), self.getPilotPort())
    self.__JR.updateJobStateTest(self.__job, self.__jobSite, self.__node, mode="test")

    defRes = self.getDefaultResources()
    if defRes['copy_input_files'] == 'true':
        self.__copyInputFiles = True
    else:
        self.__copyInputFiles = False

    status, output, hpcJob = self.prepareHPCJob()
    if status == 0:
        tolog("HPC Job: %s " % hpcJob)
    else:
        tolog("Failed to create the Tag file")
        self.failJob(0, PilotErrors.ERR_UNKNOWN, self.__job, pilotErrorDiag=output)
        return

    self.__hpcStatus = None
    self.__hpcLog = None

    logFileName = None
    tolog("runJobHPCEvent.getPilotLogFilename=%s" % self.getPilotLogFilename())
    if self.getPilotLogFilename() != "":
        logFileName = self.getPilotLogFilename()
    hpcManager = HPCManager(globalWorkingDir=self.__job.workdir, logFileName=logFileName, poolFileCatalog=self.__poolFileCatalogTemp,
                            inputFiles=self.__inputFilesGlobal, copyInputFiles=self.__copyInputFiles)
    self.__hpcManager = hpcManager

    self.HPCMode = "HPC_" + hpcManager.getMode(defRes)
    self.__job.setMode(self.HPCMode)
    self.__job.setHpcStatus('waitingResource')
    rt = RunJobUtilities.updatePilotServer(self.__job, self.getPilotServer(), self.getPilotPort())
    self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node, 25443)

    hpcManager.getFreeResources(defRes)

    self.__job.coreCount = hpcManager.getCoreCount()
    self.__job.setHpcStatus('gettingEvents')
    rt = RunJobUtilities.updatePilotServer(self.__job, self.getPilotServer(), self.getPilotPort())
    self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node, 25443)

    numRanges = hpcManager.getEventsNumber()
    tolog("HPC Manager needs events: %s, max_events: %s; using the smaller one." % (numRanges, defRes['max_events']))
    if numRanges > int(defRes['max_events']):
        numRanges = int(defRes['max_events'])
    eventRanges = self.getEventRanges(numRanges=numRanges)
    #tolog("Event Ranges: %s " % eventRanges)
    if len(eventRanges) == 0:
        tolog("Received no event ranges - returning")
        return
    for eventRange in eventRanges:
        self.__eventRanges[eventRange['eventRangeID']] = 'new'

    # set up stage-out
    self.setupStageOutHPCEvent()

    hpcManager.initJob(hpcJob)
    hpcManager.initEventRanges(eventRanges)

    hpcManager.submit()
    threadpool = ThreadPool(defRes['stageout_threads'])

    old_state = None
    time_start = time.time()
    while not hpcManager.isFinished():
        state = hpcManager.poll()
        self.__job.setHpcStatus(state)
        if old_state is None or old_state != state or time.time() > (time_start + 60 * 10):
            old_state = state
            time_start = time.time()
            tolog("HPCManager Job state: %s" % state)
            self.__JR.updateJobStateTest(self.__job, self.__jobSite, self.__node, mode="test")
            rt = RunJobUtilities.updatePilotServer(self.__job, self.getPilotServer(), self.getPilotPort())
            self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node, 25443)

        if state and state == 'Complete':
            break

        outputs = hpcManager.getOutputs()
        for output in outputs:
            #self.stageOutHPCEvent(output)
            threadpool.add_task(self.stageOutHPCEvent, output)

        time.sleep(30)
        self.updateHPCEventRanges()

    tolog("HPCManager Job Finished")
    self.__job.setHpcStatus('stagingOut')
    rt = RunJobUtilities.updatePilotServer(self.__job, self.getPilotServer(), self.getPilotPort())
    self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node, 25443)

    outputs = hpcManager.getOutputs()
    for output in outputs:
        #self.stageOutHPCEvent(output)
        threadpool.add_task(self.stageOutHPCEvent, output)

    self.updateHPCEventRanges()
    threadpool.wait_completion()
    self.updateHPCEventRanges()

    if len(self.__failedStageOuts) > 0:
        tolog("HPC stage-out retry 1")
        half_stageout_threads = defRes['stageout_threads'] / 2
        if half_stageout_threads < 1:
            half_stageout_threads = 1
        threadpool = ThreadPool(half_stageout_threads)
        failedStageOuts = self.__failedStageOuts
        self.__failedStageOuts = []
        for failedStageOut in failedStageOuts:
            threadpool.add_task(self.stageOutHPCEvent, failedStageOut)
        threadpool.wait_completion()
        self.updateHPCEventRanges()

    if len(self.__failedStageOuts) > 0:
        tolog("HPC stage-out retry 2")
        threadpool = ThreadPool(1)
        failedStageOuts = self.__failedStageOuts
        self.__failedStageOuts = []
        for failedStageOut in failedStageOuts:
            threadpool.add_task(self.stageOutHPCEvent, failedStageOut)
        threadpool.wait_completion()
        self.updateHPCEventRanges()

    self.__job.setHpcStatus('finished')
    self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node, 25443)
    self.__hpcStatus, self.__hpcLog = hpcManager.checkHPCJobLog()
    tolog("HPC job log status: %s, job log error: %s" % (self.__hpcStatus, self.__hpcLog))
analysisJob = isAnalysisJob(job.trf.split(",")[0])
if analysisJob:
    tolog("User analysis job")
else:
    tolog("Production job")
tolog("runJob received a job with prodSourceLabel=%s" % (job.prodSourceLabel))

# setup starts here ................................................................................

# update the job state file
job.jobState = "setup"
#_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

# send [especially] the process group back to the pilot
job.setState([job.jobState, 0, 0])
rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort())

# prepare the setup and get the run command list
ec, runCommandList, job, multi_trf = runJob.setup(job, jobSite, thisExperiment)
if ec != 0:
    tolog("!!WARNING!!2999!! runJob setup failed: %s" % (job.pilotErrorDiag))
    runJob.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
tolog("Setup has finished successfully")

# job has been updated, display it again
job.displayJob()

# (setup ends here) ................................................................................

tolog("Setting stage-in state until all input files have been copied")
job.setState(["stagein", 0, 0])
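# --- Illustrative note (not part of the original module) ---
# job.trf can hold several comma-separated transformations; only the first one
# determines the job type. User analysis trfs are typically distributed as
# URLs, which is presumably what isAnalysisJob() keys on. Hypothetical value:
#
#     job.trf = "http://pandaserver.cern.ch:25080/trf/user/runAthena-00-00-11,..."
#     isAnalysisJob(job.trf.split(",")[0])   # -> True for a URL-style user trf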
analysisJob = isAnalysisJob(job.trf.split(",")[0])
if analysisJob:
    tolog("User analysis job")
else:
    tolog("Production job")
tolog("runJobArgo received a job with prodSourceLabel=%s" % (job.prodSourceLabel))

# setup starts here ................................................................................

# update the job state file
job.jobState = "setup"
#_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

# send [especially] the process group back to the pilot
job.setState([job.jobState, 0, 0])
rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort())

# prepare the setup and get the run command list
ec, job = runJob.setup(job, jobSite, thisExperiment)
if ec != 0:
    tolog("!!WARNING!!2999!! runJob setup failed: %s" % (job.pilotErrorDiag))
    runJob.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
tolog("Setup has finished successfully")

# job has been updated, display it again
job.displayJob()

# (setup ends here) ................................................................................

tolog("Setting stage-in state until all input files have been copied")
job.setState(["stagein", 0, 0])