def workspec2arcjob(workspec):
    '''Convert WorkSpec.workAttributes to arc.Job object'''
    job = arc.Job()
    try:
        wsattrs = workspec.workAttributes['arcjob']
        proxyrole = workspec.workAttributes['proxyrole']
    except (KeyError, TypeError):
        # Job was not submitted yet, so workAttributes is empty or missing keys
        return (job, arc.Time(), None)

    for attr in dir(job):
        if attr not in wsattrs or attr == 'CreationTime':
            continue

        attrtype = type(getattr(job, attr))
        # Some object types need special treatment
        if attrtype == arc.StringList:
            strlist = arc.StringList()
            for item in wsattrs[attr].split('|'):
                strlist.append(str(item))
            setattr(job, attr, strlist)
        elif attrtype == arc.StringStringMap:
            ssm = arc.StringStringMap()
            for (k, v) in json.loads(wsattrs[attr]).items():
                ssm[str(k)] = str(v)
            setattr(job, attr, ssm)
        else:
            setattr(job, attr, attrtype(str(wsattrs[attr])))

    return (job, arc.Time(str(wsattrs['ModificationTime'])), proxyrole)
def getJobStatus(self, jobIDList):
    """ Get the status information for the given list of jobs
    """
    result = self._prepareProxy()
    if not result['OK']:
        gLogger.error('ARCComputingElement: failed to set up proxy', result['Message'])
        return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    jobTmpList = list(jobIDList)
    if isinstance(jobIDList, basestring):
        jobTmpList = [jobIDList]

    # Pilots are stored with a DIRAC stamp (":::XXXXX") appended
    jobList = []
    for j in jobTmpList:
        if ":::" in j:
            job = j.split(":::")[0]
        else:
            job = j
        jobList.append(job)

    resultDict = {}
    for jobID in jobList:
        gLogger.debug("Retrieving status for job %s" % jobID)
        job = self.__getARCJob(jobID)
        job.Update()
        arcState = job.State.GetGeneralState()
        gLogger.debug("ARC status for job %s is %s" % (jobID, arcState))
        if arcState:  # a non-empty string means the job is known to ARC
            resultDict[jobID] = self.mapStates[arcState]
            # Renew proxy only for jobs which are running or queuing
            if arcState in ("Running", "Queuing"):
                nearExpiry = arc.Time() + arc.Period(10000)  # 2 hours, 46 minutes and 40 seconds
                if job.ProxyExpirationTime < nearExpiry:
                    job.Renew()
                    gLogger.debug("Renewing proxy for job %s whose proxy expires at %s" %
                                  (jobID, job.ProxyExpirationTime))
            if arcState == "Hold":
                # Cancel held jobs so they don't sit in the queue forever
                gLogger.debug("Killing held job %s" % jobID)
                job.Cancel()
        else:
            resultDict[jobID] = 'Unknown'

        # If done - is it really done? Check the exit code
        if resultDict[jobID] == "Done":
            exitCode = int(job.ExitCode)
            if exitCode:
                resultDict[jobID] = "Failed"
        gLogger.debug("DIRAC status for job %s is %s" % (jobID, resultDict[jobID]))

    if not resultDict:
        return S_ERROR('No job statuses returned')
    return S_OK(resultDict)
def arcjob2workspec(arcjob, workspec):
    '''Fill WorkSpec workAttributes with ARC job attributes'''
    jobattrs = {}
    for attr in dir(arcjob):
        # Don't store internal python attrs or job description
        if re.match('^__', attr) or attr == 'JobDescriptionDocument':
            continue

        attrtype = type(getattr(arcjob, attr))
        if attrtype == int or attrtype == str:
            jobattrs[attr] = getattr(arcjob, attr)
        elif attrtype == arc.JobState:
            jobattrs[attr] = getattr(arcjob, attr).GetGeneralState()
        elif attrtype == arc.StringList:
            jobattrs[attr] = '|'.join(getattr(arcjob, attr))
        elif attrtype == arc.URL:
            jobattrs[attr] = getattr(arcjob, attr).str().replace(r'\2f', r'/')
        elif attrtype == arc.StringStringMap:
            ssm = getattr(arcjob, attr)
            tmpdict = dict(zip(ssm.keys(), ssm.values()))
            jobattrs[attr] = json.dumps(tmpdict)
        elif attrtype == arc.Period:
            jobattrs[attr] = getattr(arcjob, attr).GetPeriod()
        elif attrtype == arc.Time:
            if getattr(arcjob, attr).GetTime() != -1:
                jobattrs[attr] = getattr(arcjob, attr).str(arc.UTCTime)
        # Other attributes of complex types are not stored

    # Set update time
    jobattrs['ModificationTime'] = arc.Time().str(arc.UTCTime)

    if workspec.workAttributes:
        workspec.workAttributes['arcjob'] = jobattrs
    else:
        workspec.workAttributes = {'arcjob': jobattrs}
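# A minimal round-trip sketch for the two converters above. It assumes a
# harvester WorkSpec instance and an arc.Job returned by a successful
# submission; the names 'submitted_job', 'workspec' and the 'production'
# role label are illustrative only, not part of the code above.
def _roundtrip_example(submitted_job, workspec):
    '''Illustrative sketch: round-trip an arc.Job through WorkSpec.workAttributes.'''
    # Serialise the job into workspec.workAttributes['arcjob'] ...
    arcjob2workspec(submitted_job, workspec)
    # ... record which proxy role the job was submitted with (assumed label) ...
    workspec.workAttributes['proxyrole'] = 'production'
    # ... and later rebuild the arc.Job plus its last modification time
    (job, modtime, proxyrole) = workspec2arcjob(workspec)
    assert job.JobID == submitted_job.JobID
    return job, modtime, proxyrole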
def kill_worker(self, workspec):
    """Cancel the ARC job.

    :param workspec: worker specification
    :type workspec: WorkSpec
    :return: A tuple of return code (True for success, False otherwise) and error dialog
    :rtype: (bool, string)
    """
    # make logger
    arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
    tmplog = arclog.log

    (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)
    if not job.JobID:
        # Job not submitted
        tmplog.info("Job was not submitted so cannot be cancelled")
        return True, ''

    # Set certificate
    userconfig = arc.UserConfig(self.cred_type)
    try:
        userconfig.ProxyPath(str(self.certs[proxyrole]))
    except Exception:
        # Log a warning and return True so that the job can be cleaned
        tmplog.warning("Job {0}: no proxy found with role {1}".format(job.JobID, proxyrole))
        return True, ''

    job_supervisor = arc.JobSupervisor(userconfig, [job])
    job_supervisor.Update()
    job_supervisor.Cancel()

    notcancelled = job_supervisor.GetIDsNotProcessed()
    if job.JobID in notcancelled:
        if job.State == arc.JobState.UNDEFINED:
            # If longer than one hour since submission, assume the job never made it
            if job.SubmissionTime + arc.Period(3600) < arc.Time():
                tmplog.warning("Assuming job is lost and marking as cancelled")
                return True, ''
            # Job has not yet reached the info system
            tmplog.warning("Job is not yet in info system so cannot be cancelled")
            return False, "Job is not yet in info system so could not be cancelled"
        # Log a warning and return True so that the job can be cleaned
        tmplog.warning("Job could not be cancelled")
        return True, ''

    tmplog.info("Job cancelled successfully")
    return True, ''
def checkJobs(self):
    ''' Query all running jobs '''

    # minimum time between checks
    if time.time() < self.checktime + int(self.conf.get(['jobs', 'checkmintime'])):
        self.log.debug("mininterval not reached")
        return
    self.checktime = time.time()

    # check jobs which were last checked more than checkinterval ago
    jobstocheck = self.db.getArcJobs("arcstate in ('submitted', 'running', 'finishing', 'cancelling', 'holding') and "
                                     "jobid not like '' and cluster='" + self.cluster + "' and " +
                                     self.db.timeStampLessThan("tarcstate", self.conf.get(['jobs', 'checkinterval'])) +
                                     " limit 100000")

    njobstocheck = sum(len(v) for v in jobstocheck.values())
    if not njobstocheck:
        return
    self.log.info("%d jobs to check" % njobstocheck)
    self.resetJobs(jobstocheck)

    # Loop over proxies
    for proxyid, jobs in jobstocheck.items():
        self.uc.CredentialString(str(self.db.getProxy(proxyid)))
        job_supervisor = arc.JobSupervisor(self.uc, [j[2] for j in jobs])
        job_supervisor.Update()
        jobsupdated = job_supervisor.GetAllJobs()
        jobsnotupdated = job_supervisor.GetIDsNotProcessed()

        for (originaljobinfo, updatedjob) in zip(jobs, jobsupdated):
            (id, appjobid, originaljob, created) = originaljobinfo
            if updatedjob.JobID in jobsnotupdated:
                self.log.error("%s: Failed to find information on %s" % (appjobid, updatedjob.JobID))
                continue
            if updatedjob.JobID != originaljob.JobID:
                # something went wrong with list order
                self.log.warning("%s: Bad job id (%s), expected %s" %
                                 (appjobid, updatedjob.JobID, originaljob.JobID))
                continue

            # compare strings here to get around limitations of JobState API
            # map INLRMS:S and O to HOLD (not necessary when ARC 4.1 is used)
            if updatedjob.State.GetGeneralState() == 'Queuing' and \
               (updatedjob.State.GetSpecificState() == 'INLRMS:S' or
                updatedjob.State.GetSpecificState() == 'INLRMS:O'):
                updatedjob.State = arc.JobState('Hold')

            if originaljob.State.GetGeneralState() == updatedjob.State.GetGeneralState() \
               and self.cluster not in ['gsiftp://gar-ex-etpgrid1.garching.physik.uni-muenchen.de:2811/preempt',
                                        'gsiftp://arc1-it4i.farm.particle.cz/qfree',
                                        'gsiftp://arc2-it4i.farm.particle.cz/qfree']:
                # just update timestamp
                # Update numbers every time for superMUC since walltime is missing for finished jobs
                self.db.updateArcJob(id, {'tarcstate': self.db.getTimeStamp()})
                continue

            self.log.info("%s: Job %s: %s -> %s (%s)" %
                          (appjobid, originaljob.JobID,
                           originaljob.State.GetGeneralState(),
                           updatedjob.State.GetGeneralState(),
                           updatedjob.State.GetSpecificState()))

            # state changed, update whole Job object
            arcstate = 'submitted'
            if updatedjob.State == arc.JobState.FINISHED:
                if updatedjob.ExitCode == -1:
                    # Missing exit code, but assume success
                    self.log.warning("%s: Job %s FINISHED but has missing exit code, setting to zero" %
                                     (appjobid, updatedjob.JobID))
                    updatedjob.ExitCode = 0
                arcstate = 'finished'
                self.log.debug('%s: reported walltime %d, cputime %d' %
                               (appjobid,
                                updatedjob.UsedTotalWallTime.GetPeriod(),
                                updatedjob.UsedTotalCPUTime.GetPeriod()))
            elif updatedjob.State == arc.JobState.FAILED:
                # EMI-ES reports cancelled jobs as failed so check substate (this is fixed in ARC 6.8)
                if 'cancel' in updatedjob.State.GetSpecificState():
                    arcstate = 'cancelled'
                else:
                    arcstate = self.processJobErrors(id, appjobid, updatedjob)
            elif updatedjob.State == arc.JobState.KILLED:
                arcstate = 'cancelled'
            elif updatedjob.State == arc.JobState.RUNNING:
                arcstate = 'running'
            elif updatedjob.State == arc.JobState.FINISHING:
                arcstate = 'finishing'
            elif updatedjob.State == arc.JobState.HOLD:
                arcstate = 'holding'
            elif updatedjob.State == arc.JobState.DELETED or \
                 updatedjob.State == arc.JobState.OTHER:
                # unexpected
                arcstate = 'failed'

            # Walltime reported by ARC 6 is multiplied by cores
            if arc.ARC_VERSION_MAJOR >= 6 and updatedjob.RequestedSlots > 0:
                updatedjob.UsedTotalWallTime = arc.Period(
                    updatedjob.UsedTotalWallTime.GetPeriod() // updatedjob.RequestedSlots)

            # Fix crazy wallclock and CPU times
            if updatedjob.UsedTotalWallTime > arc.Time() - arc.Time(int(created.strftime("%s"))):
                fixedwalltime = arc.Time() - arc.Time(int(created.strftime("%s")))
                self.log.warning("%s: Fixing reported walltime %d to %d" %
                                 (appjobid, updatedjob.UsedTotalWallTime.GetPeriod(),
                                  fixedwalltime.GetPeriod()))
                updatedjob.UsedTotalWallTime = fixedwalltime
            if updatedjob.UsedTotalCPUTime > arc.Period(10**7):
                self.log.warning("%s: Discarding reported CPUtime %d" %
                                 (appjobid, updatedjob.UsedTotalCPUTime.GetPeriod()))
                updatedjob.UsedTotalCPUTime = arc.Period(-1)

            self.db.updateArcJob(id, {'arcstate': arcstate,
                                      'tarcstate': self.db.getTimeStamp(),
                                      'tstate': self.db.getTimeStamp()},
                                 updatedjob)

    self.log.info('Done')
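# The time-sanitisation logic at the end of checkJobs, restated in plain
# integers as a sketch. It assumes walltime/cputime in seconds, the ARC major
# version, the slot count and the job-creation epoch; the function name and
# parameters are illustrative, not part of aCT.
def _sanitize_times_sketch(walltime, cputime, arc_major, slots, created_epoch, now_epoch):
    '''Apply the same corrections checkJobs makes to reported times.'''
    # ARC 6 reports walltime summed over cores, so divide by the slot count
    if arc_major >= 6 and slots > 0:
        walltime //= slots
    # Walltime cannot exceed the job's age; clamp it if it does
    age = now_epoch - created_epoch
    if walltime > age:
        walltime = age
    # CPU time above 10**7 seconds (about 116 days) is treated as garbage
    if cputime > 10**7:
        cputime = -1
    return walltime, cputime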
def getJobStatus(self, jobIDList):
    """ Get the status information for the given list of jobs
    """
    result = self._prepareProxy()
    if not result['OK']:
        self.log.error('ARCComputingElement: failed to set up proxy', result['Message'])
        return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    jobTmpList = list(jobIDList)
    if isinstance(jobIDList, six.string_types):
        jobTmpList = [jobIDList]

    # Pilots are stored with a DIRAC stamp (":::XXXXX") appended
    jobList = []
    for j in jobTmpList:
        if ":::" in j:
            job = j.split(":::")[0]
        else:
            job = j
        jobList.append(job)

    jobs = []
    for jobID in jobList:
        jobs.append(self.__getARCJob(jobID))

    # JobSupervisor is able to aggregate jobs to perform bulk operations and thus minimizes the communication overhead
    # We still need to create chunks to avoid timeout in the case there are too many jobs to supervise
    jobsUpdated = []
    for chunk in breakListIntoChunks(jobs, 100):
        job_supervisor = arc.JobSupervisor(self.usercfg, chunk)
        job_supervisor.Update()
        jobsUpdated.extend(job_supervisor.GetAllJobs())

    resultDict = {}
    jobsToRenew = []
    jobsToCancel = []
    for job in jobsUpdated:
        jobID = job.JobID
        self.log.debug("Retrieving status for job %s" % jobID)
        arcState = job.State.GetGeneralState()
        self.log.debug("ARC status for job %s is %s" % (jobID, arcState))
        if arcState:  # a non-empty string means the job is known to ARC
            resultDict[jobID] = self.mapStates[arcState]
            # Renew proxy only for jobs which are running or queuing
            if arcState in ("Running", "Queuing"):
                nearExpiry = arc.Time() + arc.Period(10000)  # 2 hours, 46 minutes and 40 seconds
                if job.ProxyExpirationTime < nearExpiry:
                    # Jobs to renew are aggregated to perform bulk operations
                    jobsToRenew.append(job)
                    self.log.debug("Renewing proxy for job %s whose proxy expires at %s" %
                                   (jobID, job.ProxyExpirationTime))
            if arcState == "Hold":
                # Jobs to cancel are aggregated to perform bulk operations
                # Cancel held jobs so they don't sit in the queue forever
                jobsToCancel.append(job)
                self.log.debug("Killing held job %s" % jobID)
        else:
            resultDict[jobID] = 'Unknown'

        # If done - is it really done? Check the exit code
        if resultDict[jobID] == "Done":
            exitCode = int(job.ExitCode)
            if exitCode:
                resultDict[jobID] = "Failed"
        self.log.debug("DIRAC status for job %s is %s" % (jobID, resultDict[jobID]))

    # JobSupervisor is able to aggregate jobs to perform bulk operations and thus minimizes the communication overhead
    # We still need to create chunks to avoid timeout in the case there are too many jobs to supervise
    for chunk in breakListIntoChunks(jobsToRenew, 100):
        job_supervisor_renew = arc.JobSupervisor(self.usercfg, chunk)
        if not job_supervisor_renew.Renew():
            self.log.warn('At least one of the jobs failed to renew its credentials')

    for chunk in breakListIntoChunks(jobsToCancel, 100):
        job_supervisor_cancel = arc.JobSupervisor(self.usercfg, chunk)
        if not job_supervisor_cancel.Cancel():
            self.log.warn('At least one of the jobs failed to be cancelled')

    if not resultDict:
        return S_ERROR('No job statuses returned')
    return S_OK(resultDict)
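# For context: breakListIntoChunks used above is a DIRAC utility
# (DIRAC.Core.Utilities.List). A minimal sketch of the chunking behaviour the
# code relies on, so each arc.JobSupervisor call handles at most chunkSize
# jobs; the helper name here is illustrative only.
def _break_list_into_chunks_sketch(items, chunkSize):
    '''Yield successive slices of at most chunkSize elements (illustrative only).'''
    for i in range(0, len(items), chunkSize):
        yield items[i:i + chunkSize]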
def processToCancel(self):
    '''Cancel jobs in arcstate tocancel'''
    jobstocancel = self.db.getArcJobs("arcstate='tocancel' and cluster='" + self.cluster + "'")
    if not jobstocancel:
        return

    self.log.info("Cancelling %i jobs" % sum(len(v) for v in jobstocancel.values()))
    for proxyid, jobs in jobstocancel.items():
        self.uc.CredentialString(self.db.getProxy(proxyid))

        job_supervisor = arc.JobSupervisor(self.uc, [j[2] for j in jobs])
        job_supervisor.Update()
        job_supervisor.Cancel()
        notcancelled = job_supervisor.GetIDsNotProcessed()

        for (id, appjobid, job, created) in jobs:
            if not job.JobID:
                # Job not submitted
                self.log.info("%s: Marking unsubmitted job cancelled" % appjobid)
                self.db.updateArcJob(id, {"arcstate": "cancelled",
                                          "tarcstate": self.db.getTimeStamp()})
            elif job.JobID in notcancelled:
                if job.State == arc.JobState.UNDEFINED:
                    # If longer than one hour since submission, assume the job never made it
                    if job.StartTime + arc.Period(3600) < arc.Time():
                        self.log.warning("%s: Assuming job %s is lost and marking as cancelled" %
                                         (appjobid, job.JobID))
                        self.db.updateArcJob(id, {"arcstate": "cancelled",
                                                  "tarcstate": self.db.getTimeStamp()})
                    else:
                        # Job has not yet reached the info system
                        self.log.warning("%s: Job %s is not yet in info system so cannot be cancelled" %
                                         (appjobid, job.JobID))
                else:
                    self.log.error("%s: Could not cancel job %s" % (appjobid, job.JobID))
                    # Just mark as cancelled so it can be cleaned
                    self.db.updateArcJob(id, {"arcstate": "cancelled",
                                              "tarcstate": self.db.getTimeStamp()})
            else:
                self.db.updateArcJob(id, {"arcstate": "cancelling",
                                          "tarcstate": self.db.getTimeStamp()})
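# The one-hour heuristic shared by kill_worker and processToCancel, isolated
# as a sketch: a job still UNDEFINED in the info system more than an hour
# after submission is assumed lost. 'submit_time' is an arc.Time (SubmissionTime
# or StartTime above); the helper name is illustrative only.
def _job_assumed_lost_sketch(submit_time):
    '''Return True if more than one hour has passed since submit_time.'''
    return submit_time + arc.Period(3600) < arc.Time()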
def check_workers(self, workspec_list):
    """Check the status of the ARC job for each worker.

    :param workspec_list: list of WorkSpec objects to check
    :return: A tuple of return code (True for success) and a list of (status, error dialog) tuples
    """
    retList = []
    for workspec in workspec_list:
        # make logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log
        tmplog.info("checking worker id {0}".format(workspec.workerID))
        (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)

        # Set certificate
        userconfig = arc.UserConfig(self.cred_type)
        try:
            userconfig.ProxyPath(str(self.certs[proxyrole]))
        except Exception:
            tmplog.error("Job {0}: no proxy found with role {1}".format(job.JobID, proxyrole))
            retList.append((workspec.status, ''))
            continue

        job_supervisor = arc.JobSupervisor(userconfig, [job])
        job_supervisor.Update()
        jobsupdated = job_supervisor.GetAllJobs()
        jobsnotupdated = job_supervisor.GetIDsNotProcessed()

        for updatedjob in jobsupdated:
            if updatedjob.JobID in jobsnotupdated:
                tmplog.error("Failed to find information on {0}".format(updatedjob.JobID))
                # If missing for too long (2 days), mark as lost
                if arc.Time() - modtime > arc.Period(172800):
                    tmplog.error("Job {0} missing for more than 2 days, marking as lost".format(
                        updatedjob.JobID))
                    retList.append((workspec.ST_failed, ''))
                else:
                    retList.append((workspec.status, ''))
                continue

            # Convert arc state to WorkSpec state
            arcstatus = updatedjob.State
            newstatus = WorkSpec.ST_submitted
            if arcstatus == arc.JobState.RUNNING or \
               arcstatus == arc.JobState.FINISHING:
                newstatus = WorkSpec.ST_running
            elif arcstatus == arc.JobState.FINISHED:
                if updatedjob.ExitCode == -1:
                    # Missing exit code, but assume success
                    tmplog.warning("Job {0} FINISHED but has missing exit code, setting to zero".format(
                        updatedjob.JobID))
                    updatedjob.ExitCode = 0
                newstatus = WorkSpec.ST_finished
            elif arcstatus == arc.JobState.FAILED:
                newstatus = WorkSpec.ST_failed
                tmplog.info("Job {0} failed: {1}".format(
                    updatedjob.JobID, ";".join([joberr for joberr in updatedjob.Error])))
            elif arcstatus == arc.JobState.KILLED:
                newstatus = WorkSpec.ST_cancelled
            elif arcstatus == arc.JobState.DELETED or \
                 arcstatus == arc.JobState.OTHER:
                # unexpected
                newstatus = WorkSpec.ST_failed
            # Not covered: arc.JobState.HOLD. Maybe need a post-run state in
            # harvester, also to cover FINISHING

            # compare strings here to get around limitations of JobState API
            if job.State.GetGeneralState() == updatedjob.State.GetGeneralState():
                tmplog.debug("Job {0} still in state {1}".format(
                    job.JobID, job.State.GetGeneralState()))
                retList.append((newstatus, ''))
                continue

            tmplog.info("Job {0}: {1} -> {2} ({3})".format(
                job.JobID, job.State.GetGeneralState(),
                updatedjob.State.GetGeneralState(),
                updatedjob.State.GetSpecificState()))

            arc_utils.arcjob2workspec(updatedjob, workspec)
            # Have to force update to change info in DB
            workspec.force_update('workAttributes')

            tmplog.debug("batchStatus {0} -> workerStatus {1}".format(
                arcstatus.GetGeneralState(), newstatus))
            retList.append((newstatus, ''))

    return True, retList
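# The arc.JobState -> WorkSpec state mapping implemented by the if/elif chain
# in check_workers, written out as a table for reference. A sketch assuming
# the same WorkSpec constants and that arc.JobState values are hashable enum
# constants; HOLD is intentionally left unmapped, as noted above.
_ARC_TO_WORKSPEC_SKETCH = {
    arc.JobState.RUNNING: WorkSpec.ST_running,
    arc.JobState.FINISHING: WorkSpec.ST_running,
    arc.JobState.FINISHED: WorkSpec.ST_finished,
    arc.JobState.FAILED: WorkSpec.ST_failed,
    arc.JobState.KILLED: WorkSpec.ST_cancelled,
    arc.JobState.DELETED: WorkSpec.ST_failed,
    arc.JobState.OTHER: WorkSpec.ST_failed,
    # any other state (ACCEPTED, PREPARING, QUEUING, ...) stays ST_submitted
}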