Example #1
def workspec2arcjob(workspec):
    '''Convert WorkSpec.workAttributes to arc.Job object'''

    job = arc.Job()
    try:
        wsattrs = workspec.workAttributes['arcjob']
        proxyrole = workspec.workAttributes['proxyrole']
    except (KeyError, TypeError):
        # Job was not submitted yet
        return (job, arc.Time(), None)

    for attr in dir(job):
        if attr not in wsattrs or attr == 'CreationTime':
            continue

        attrtype = type(getattr(job, attr))
        # Some object types need special treatment
        if attrtype == arc.StringList:
            strlist = arc.StringList()
            for item in wsattrs[attr].split('|'):
                strlist.append(str(item))
            setattr(job, attr, strlist)
        elif attrtype == arc.StringStringMap:
            ssm = arc.StringStringMap()
            for (k, v) in json.loads(wsattrs[attr]).items():
                ssm[str(k)] = str(v)
            setattr(job, attr, ssm)
        else:
            setattr(job, attr, attrtype(str(wsattrs[attr])))
    return (job, arc.Time(str(wsattrs['ModificationTime'])), proxyrole)
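A minimal usage sketch of the function above (hypothetical caller code; it assumes the workspec was previously filled by arcjob2workspec and that a proxy role was stored alongside it):

# Hypothetical caller, not part of the example above
(job, modtime, proxyrole) = workspec2arcjob(workspec)
if not job.JobID:
    # Nothing stored yet, so the worker has no submitted ARC job
    print("worker has no ARC job attached")
else:
    print("ARC job %s, last modified %s, proxy role %s"
          % (job.JobID, modtime.str(arc.UTCTime), proxyrole))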
Example #2
    def getJobStatus(self, jobIDList):
        """ Get the status information for the given list of jobs
    """

        result = self._prepareProxy()
        if not result['OK']:
            gLogger.error('ARCComputingElement: failed to set up proxy',
                          result['Message'])
            return result
        self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

        jobTmpList = list(jobIDList)
        if isinstance(jobIDList, basestring):
            jobTmpList = [jobIDList]

        # Pilots are stored with a DIRAC stamp (":::XXXXX") appended
        jobList = []
        for j in jobTmpList:
            if ":::" in j:
                job = j.split(":::")[0]
            else:
                job = j
            jobList.append(job)

        resultDict = {}
        for jobID in jobList:
            gLogger.debug("Retrieving status for job %s" % jobID)
            job = self.__getARCJob(jobID)
            job.Update()
            arcState = job.State.GetGeneralState()
            gLogger.debug("ARC status for job %s is %s" % (jobID, arcState))
            if arcState:  # a non-empty string means the CE reported a state
                resultDict[jobID] = self.mapStates[arcState]
                # Renew proxy only for jobs which are running or queuing
                if arcState in ("Running", "Queuing"):
                    nearExpiry = arc.Time() + arc.Period(
                        10000)  # 2 hours, 46 minutes and 40 seconds
                    if job.ProxyExpirationTime < nearExpiry:
                        job.Renew()
                        gLogger.debug(
                            "Renewing proxy for job %s whose proxy expires at %s"
                            % (jobID, job.ProxyExpirationTime))
                if arcState == "Hold":
                    # Cancel held jobs so they don't sit in the queue forever
                    gLogger.debug("Killing held job %s" % jobID)
                    job.Cancel()
            else:
                resultDict[jobID] = 'Unknown'
            # If done - is it really done? Check the exit code
            if resultDict[jobID] == "Done":
                exitCode = int(job.ExitCode)
                if exitCode:
                    resultDict[jobID] = "Failed"
            gLogger.debug("DIRAC status for job %s is %s" %
                          (jobID, resultDict[jobID]))

        if not resultDict:
            return S_ERROR('No job statuses returned')

        return S_OK(resultDict)
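The dictionary self.mapStates used above translates ARC general states into DIRAC pilot states. It is defined elsewhere in ARCComputingElement; a mapping along these lines is assumed (illustrative only, the authoritative version lives in DIRAC):

# Illustrative sketch of self.mapStates; consult ARCComputingElement for the real mapping
mapStates = {
    'Accepted': 'Scheduled',
    'Preparing': 'Scheduled',
    'Submitting': 'Scheduled',
    'Queuing': 'Scheduled',
    'Running': 'Running',
    'Finishing': 'Running',
    'Finished': 'Done',
    'Failed': 'Failed',
    'Hold': 'Failed',
    'Killed': 'Killed',
    'Deleted': 'Killed',
    'Undefined': 'Unknown',
}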
Example #3
def arcjob2workspec(arcjob, workspec):
    '''Fill WorkSpec workAttributes with ARC job attributes'''

    jobattrs = {}
    for attr in dir(arcjob):
        # Don't store internal python attrs or job description
        if re.match('^__', attr) or attr == 'JobDescriptionDocument':
            continue

        attrtype = type(getattr(arcjob, attr))
        if attrtype == int or attrtype == str:
            jobattrs[attr] = getattr(arcjob, attr)
        elif attrtype == arc.JobState:
            jobattrs[attr] = getattr(arcjob, attr).GetGeneralState()
        elif attrtype == arc.StringList:
            jobattrs[attr] = '|'.join(getattr(arcjob, attr))
        elif attrtype == arc.URL:
            jobattrs[attr] = getattr(arcjob, attr).str().replace(r'\2f', r'/')
        elif attrtype == arc.StringStringMap:
            ssm = getattr(arcjob, attr)
            tmpdict = dict(zip(ssm.keys(), ssm.values()))
            jobattrs[attr] = json.dumps(tmpdict)
        elif attrtype == arc.Period:
            jobattrs[attr] = getattr(arcjob, attr).GetPeriod()
        elif attrtype == arc.Time:
            if getattr(arcjob, attr).GetTime() != -1:
                jobattrs[attr] = getattr(arcjob, attr).str(arc.UTCTime)
        # Other attributes of complex types are not stored

    # Set update time
    jobattrs['ModificationTime'] = arc.Time().str(arc.UTCTime)
    if workspec.workAttributes:
        workspec.workAttributes['arcjob'] = jobattrs
    else:
        workspec.workAttributes = {'arcjob': jobattrs}
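Examples #1 and #3 form a round trip: the attributes serialised here are exactly what workspec2arcjob reads back. A short sketch, assuming arcjob is a populated arc.Job; the proxy role is stored separately by the submitter, so the value below is only a placeholder:

# Persist the ARC job attributes on the worker
arcjob2workspec(arcjob, workspec)
workspec.workAttributes['proxyrole'] = 'production'  # placeholder, set by the submitter in practice

# Later, rebuild an arc.Job from what was stored
(restored, modtime, proxyrole) = workspec2arcjob(workspec)
print("restored ARC job %s, last update %s" % (restored.JobID, modtime.str(arc.UTCTime)))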
Example #4
    def kill_worker(self, workspec):
        """Cancel the ARC job.

        :param workspec: worker specification
        :type workspec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and an error message
        :rtype: (bool, string)
        """

        # make logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log

        (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)
        if not job.JobID:
            # Job not submitted
            tmplog.info("Job was not submitted so cannot be cancelled")
            return True, ''

        # Set certificate
        userconfig = arc.UserConfig(self.cred_type)
        try:
            userconfig.ProxyPath(str(self.certs[proxyrole]))
        except (KeyError, TypeError):
            # Log a warning and return True so that job can be cleaned
            tmplog.warning("Job {0}: no proxy found with role {1}".format(
                job.JobID, proxyrole))
            return True, ''

        job_supervisor = arc.JobSupervisor(userconfig, [job])
        job_supervisor.Update()
        job_supervisor.Cancel()

        notcancelled = job_supervisor.GetIDsNotProcessed()

        if job.JobID in notcancelled:
            if job.State == arc.JobState.UNDEFINED:
                # If longer than one hour since submission assume job never made it
                if job.SubmissionTime + arc.Period(3600) < arc.Time():
                    tmplog.warning(
                        "Assuming job is lost and marking as cancelled")
                    return True, ''

                # Job has not yet reached info system
                tmplog.warning(
                    "Job is not yet in info system so cannot be cancelled")
                return False, "Job is not yet in info system so could not be cancelled"

            # Log a warning and return True so that job can be cleaned
            tmplog.warning("Job could not be cancelled")
            return True, ''

        tmplog.info("Job cancelled successfully")
        return True, ''
Example #5
    def checkJobs(self):
        '''
        Query all running jobs
        '''

        # minimum time between checks
        if time.time() < self.checktime + int(
                self.conf.get(['jobs', 'checkmintime'])):
            self.log.debug("mininterval not reached")
            return
        self.checktime = time.time()

        # check jobs which were last checked more than checkinterval ago
        jobstocheck = self.db.getArcJobs("arcstate in ('submitted', 'running', 'finishing', 'cancelling', 'holding') and " \
                                         "jobid not like '' and cluster='"+self.cluster+"' and "+ \
                                         self.db.timeStampLessThan("tarcstate", self.conf.get(['jobs','checkinterval'])) + \
                                         " limit 100000")

        njobstocheck = sum(len(v) for v in jobstocheck.values())
        if not njobstocheck:
            return
        self.log.info("%d jobs to check" % njobstocheck)
        self.resetJobs(jobstocheck)

        # Loop over proxies
        for proxyid, jobs in jobstocheck.items():
            self.uc.CredentialString(str(self.db.getProxy(proxyid)))

            job_supervisor = arc.JobSupervisor(self.uc, [j[2] for j in jobs])
            job_supervisor.Update()
            jobsupdated = job_supervisor.GetAllJobs()
            jobsnotupdated = job_supervisor.GetIDsNotProcessed()

            for (originaljobinfo, updatedjob) in zip(jobs, jobsupdated):
                (id, appjobid, originaljob, created) = originaljobinfo
                if updatedjob.JobID in jobsnotupdated:
                    self.log.error("%s: Failed to find information on %s" %
                                   (appjobid, updatedjob.JobID))
                    continue
                if updatedjob.JobID != originaljob.JobID:
                    # something went wrong with list order
                    self.log.warning(
                        "%s: Bad job id (%s), expected %s" %
                        (appjobid, updatedjob.JobID, originaljob.JobID))
                    continue
                # compare strings here to get around limitations of JobState API
                # map INLRMS:S and O to HOLD (not necessary when ARC 4.1 is used)
                if updatedjob.State.GetGeneralState() == 'Queuing' and (
                        updatedjob.State.GetSpecificState() == 'INLRMS:S'
                        or updatedjob.State.GetSpecificState() == 'INLRMS:O'):
                    updatedjob.State = arc.JobState('Hold')
                if originaljob.State.GetGeneralState() == updatedjob.State.GetGeneralState() \
                     and self.cluster not in ['gsiftp://gar-ex-etpgrid1.garching.physik.uni-muenchen.de:2811/preempt',
                                              'gsiftp://arc1-it4i.farm.particle.cz/qfree',
                                              'gsiftp://arc2-it4i.farm.particle.cz/qfree']:
                    # just update timestamp
                    # Update numbers every time for superMUC since walltime is missing for finished jobs
                    self.db.updateArcJob(id,
                                         {'tarcstate': self.db.getTimeStamp()})
                    continue

                self.log.info("%s: Job %s: %s -> %s (%s)" %
                              (appjobid, originaljob.JobID,
                               originaljob.State.GetGeneralState(),
                               updatedjob.State.GetGeneralState(),
                               updatedjob.State.GetSpecificState()))

                # state changed, update whole Job object
                arcstate = 'submitted'
                if updatedjob.State == arc.JobState.FINISHED:
                    if updatedjob.ExitCode == -1:
                        # Missing exit code, but assume success
                        self.log.warning(
                            "%s: Job %s FINISHED but has missing exit code, setting to zero"
                            % (appjobid, updatedjob.JobID))
                        updatedjob.ExitCode = 0
                    arcstate = 'finished'
                    self.log.debug(
                        '%s: reported walltime %d, cputime %d' %
                        (appjobid, updatedjob.UsedTotalWallTime.GetPeriod(),
                         updatedjob.UsedTotalCPUTime.GetPeriod()))
                elif updatedjob.State == arc.JobState.FAILED:
                    # EMI-ES reports cancelled jobs as failed so check substate (this is fixed in ARC 6.8)
                    if 'cancel' in updatedjob.State.GetSpecificState():
                        arcstate = 'cancelled'
                    else:
                        arcstate = self.processJobErrors(
                            id, appjobid, updatedjob)
                elif updatedjob.State == arc.JobState.KILLED:
                    arcstate = 'cancelled'
                elif updatedjob.State == arc.JobState.RUNNING:
                    arcstate = 'running'
                elif updatedjob.State == arc.JobState.FINISHING:
                    arcstate = 'finishing'
                elif updatedjob.State == arc.JobState.HOLD:
                    arcstate = 'holding'
                elif updatedjob.State == arc.JobState.DELETED or \
                     updatedjob.State == arc.JobState.OTHER:
                    # unexpected
                    arcstate = 'failed'

                # Walltime reported by ARC 6 is multiplied by cores
                if arc.ARC_VERSION_MAJOR >= 6 and updatedjob.RequestedSlots > 0:
                    updatedjob.UsedTotalWallTime = arc.Period(
                        updatedjob.UsedTotalWallTime.GetPeriod() //
                        updatedjob.RequestedSlots)
                # Fix crazy wallclock and CPU times
                if updatedjob.UsedTotalWallTime > arc.Time() - arc.Time(
                        int(created.strftime("%s"))):
                    fixedwalltime = arc.Time() - arc.Time(
                        int(created.strftime("%s")))
                    self.log.warning(
                        "%s: Fixing reported walltime %d to %d" %
                        (appjobid, updatedjob.UsedTotalWallTime.GetPeriod(),
                         fixedwalltime.GetPeriod()))
                    updatedjob.UsedTotalWallTime = fixedwalltime
                if updatedjob.UsedTotalCPUTime > arc.Period(10**7):
                    self.log.warning(
                        "%s: Discarding reported CPUtime %d" %
                        (appjobid, updatedjob.UsedTotalCPUTime.GetPeriod()))
                    updatedjob.UsedTotalCPUTime = arc.Period(-1)
                self.db.updateArcJob(
                    id, {
                        'arcstate': arcstate,
                        'tarcstate': self.db.getTimeStamp(),
                        'tstate': self.db.getTimeStamp()
                    }, updatedjob)

        self.log.info('Done')
Example #6
  def getJobStatus(self, jobIDList):
    """ Get the status information for the given list of jobs
    """

    result = self._prepareProxy()
    if not result['OK']:
      self.log.error('ARCComputingElement: failed to set up proxy', result['Message'])
      return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    jobTmpList = list(jobIDList)
    if isinstance(jobIDList, six.string_types):
      jobTmpList = [jobIDList]

    # Pilots are stored with a DIRAC stamp (":::XXXXX") appended
    jobList = []
    for j in jobTmpList:
      if ":::" in j:
        job = j.split(":::")[0]
      else:
        job = j
      jobList.append(job)

    jobs = []
    for jobID in jobList:
      jobs.append(self.__getARCJob(jobID))

    # JobSupervisor is able to aggregate jobs to perform bulk operations and thus minimizes the communication overhead
    # We still need to create chunks to avoid timeout in the case there are too many jobs to supervise
    jobsUpdated = []
    for chunk in breakListIntoChunks(jobs, 100):
      job_supervisor = arc.JobSupervisor(self.usercfg, chunk)
      job_supervisor.Update()
      jobsUpdated.extend(job_supervisor.GetAllJobs())

    resultDict = {}
    jobsToRenew = []
    jobsToCancel = []
    for job in jobsUpdated:
      jobID = job.JobID
      self.log.debug("Retrieving status for job %s" % jobID)
      arcState = job.State.GetGeneralState()
      self.log.debug("ARC status for job %s is %s" % (jobID, arcState))
      if arcState:  # a non-empty string means the CE reported a state
        resultDict[jobID] = self.mapStates[arcState]
        # Renew proxy only for jobs which are running or queuing
        if arcState in ("Running", "Queuing"):
          nearExpiry = arc.Time() + arc.Period(10000)  # 2 hours, 46 minutes and 40 seconds
          if job.ProxyExpirationTime < nearExpiry:
            # Jobs to renew are aggregated to perform bulk operations
            jobsToRenew.append(job)
            self.log.debug("Renewing proxy for job %s whose proxy expires at %s" % (jobID, job.ProxyExpirationTime))
        if arcState == "Hold":
          # Jobs to cancel are aggregated to perform bulk operations
          # Cancel held jobs so they don't sit in the queue forever
          jobsToCancel.append(job)
          self.log.debug("Killing held job %s" % jobID)
      else:
        resultDict[jobID] = 'Unknown'
      # If done - is it really done? Check the exit code
      if resultDict[jobID] == "Done":
        exitCode = int(job.ExitCode)
        if exitCode:
          resultDict[jobID] = "Failed"
      self.log.debug("DIRAC status for job %s is %s" % (jobID, resultDict[jobID]))

    # JobSupervisor is able to aggregate jobs to perform bulk operations and thus minimizes the communication overhead
    # We still need to create chunks to avoid timeout in the case there are too many jobs to supervise
    for chunk in breakListIntoChunks(jobsToRenew, 100):
      job_supervisor_renew = arc.JobSupervisor(self.usercfg, chunk)
      if not job_supervisor_renew.Renew():
        self.log.warn('At least one of the jobs failed to renew its credentials')

    for chunk in breakListIntoChunks(jobsToCancel, 100):
      job_supervisor_cancel = arc.JobSupervisor(self.usercfg, chunk)
      if not job_supervisor_cancel.Cancel():
        self.log.warn('At least one of the jobs failed to be cancelled')

    if not resultDict:
      return S_ERROR('No job statuses returned')

    return S_OK(resultDict)
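breakListIntoChunks is imported from DIRAC's list utilities (DIRAC.Core.Utilities.List); for readers without DIRAC at hand, a simplified stand-in could look like this:

def breakListIntoChunks(aList, chunkSize):
    """Return a list of sublists holding at most chunkSize elements each (simplified stand-in)."""
    items = list(aList)
    return [items[i:i + chunkSize] for i in range(0, len(items), chunkSize)]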
Example #7
    def processToCancel(self):
        '''Cancel jobs which are in arcstate tocancel'''

        jobstocancel = self.db.getArcJobs("arcstate='tocancel' and cluster='" +
                                          self.cluster + "'")
        if not jobstocancel:
            return

        self.log.info("Cancelling %i jobs" %
                      sum(len(v) for v in jobstocancel.values()))
        for proxyid, jobs in jobstocancel.items():
            self.uc.CredentialString(self.db.getProxy(proxyid))

            job_supervisor = arc.JobSupervisor(self.uc, [j[2] for j in jobs])
            job_supervisor.Update()
            job_supervisor.Cancel()

            notcancelled = job_supervisor.GetIDsNotProcessed()

            for (id, appjobid, job, created) in jobs:

                if not job.JobID:
                    # Job not submitted
                    self.log.info("%s: Marking unsubmitted job cancelled" %
                                  appjobid)
                    self.db.updateArcJob(
                        id, {
                            "arcstate": "cancelled",
                            "tarcstate": self.db.getTimeStamp()
                        })

                elif job.JobID in notcancelled:
                    if job.State == arc.JobState.UNDEFINED:
                        # If longer than one hour since submission assume job never made it
                        if job.StartTime + arc.Period(3600) < arc.Time():
                            self.log.warning(
                                "%s: Assuming job %s is lost and marking as cancelled"
                                % (appjobid, job.JobID))
                            self.db.updateArcJob(
                                id, {
                                    "arcstate": "cancelled",
                                    "tarcstate": self.db.getTimeStamp()
                                })
                        else:
                            # Job has not yet reached info system
                            self.log.warning(
                                "%s: Job %s is not yet in info system so cannot be cancelled"
                                % (appjobid, job.JobID))
                    else:
                        self.log.error("%s: Could not cancel job %s" %
                                       (appjobid, job.JobID))
                        # Just to mark as cancelled so it can be cleaned
                        self.db.updateArcJob(
                            id, {
                                "arcstate": "cancelled",
                                "tarcstate": self.db.getTimeStamp()
                            })
                else:
                    self.db.updateArcJob(
                        id, {
                            "arcstate": "cancelling",
                            "tarcstate": self.db.getTimeStamp()
                        })
Example #8
    def check_workers(self, workspec_list):
        retList = []
        for workspec in workspec_list:

            # make logger
            arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
            tmplog = arclog.log
            tmplog.info("checking worker id {0}".format(workspec.workerID))
            (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)

            # Set certificate
            userconfig = arc.UserConfig(self.cred_type)
            try:
                userconfig.ProxyPath(str(self.certs[proxyrole]))
            except (KeyError, TypeError):
                tmplog.error("Job {0}: no proxy found with role {1}".format(
                    job.JobID, proxyrole))
                retList.append((workspec.status, ''))
                continue

            job_supervisor = arc.JobSupervisor(userconfig, [job])
            job_supervisor.Update()

            jobsupdated = job_supervisor.GetAllJobs()
            jobsnotupdated = job_supervisor.GetIDsNotProcessed()

            for updatedjob in jobsupdated:
                if updatedjob.JobID in jobsnotupdated:
                    tmplog.error("Failed to find information on {0}".format(
                        updatedjob.JobID))
                    # If missing for too long (2 days), mark as lost
                    if arc.Time() - modtime > arc.Period(172800):
                        tmplog.error(
                            "Job {0} missing for more than 2 days, marking as lost"
                            .format(updatedjob.JobID))
                        retList.append((workspec.ST_failed, ''))
                    else:
                        retList.append((workspec.status, ''))
                    continue

                # Convert arc state to WorkSpec state
                arcstatus = updatedjob.State
                newstatus = WorkSpec.ST_submitted
                if arcstatus == arc.JobState.RUNNING or \
                   arcstatus == arc.JobState.FINISHING:
                    newstatus = WorkSpec.ST_running
                elif arcstatus == arc.JobState.FINISHED:
                    if updatedjob.ExitCode == -1:
                        # Missing exit code, but assume success
                        tmplog.warning(
                            "Job {0} FINISHED but has missing exit code, setting to zero"
                            .format(updatedjob.JobID))
                        updatedjob.ExitCode = 0
                    newstatus = WorkSpec.ST_finished
                elif arcstatus == arc.JobState.FAILED:
                    newstatus = WorkSpec.ST_failed
                    tmplog.info("Job {0} failed: {1}".format(
                        updatedjob.JobID,
                        ";".join([joberr for joberr in updatedjob.Error])))
                elif arcstatus == arc.JobState.KILLED:
                    newstatus = WorkSpec.ST_cancelled
                elif arcstatus == arc.JobState.DELETED or \
                     arcstatus == arc.JobState.OTHER:
                    # unexpected
                    newstatus = WorkSpec.ST_failed
                # Not covered: arc.JobState.HOLD. Maybe need a post-run state in
                # harvester, also to cover FINISHING

                # compare strings here to get around limitations of JobState API
                if job.State.GetGeneralState() == updatedjob.State.GetGeneralState():
                    tmplog.debug("Job {0} still in state {1}".format(
                        job.JobID, job.State.GetGeneralState()))
                    retList.append((newstatus, ''))
                    continue

                tmplog.info("Job {0}: {1} -> {2} ({3})".format(
                    job.JobID, job.State.GetGeneralState(),
                    updatedjob.State.GetGeneralState(),
                    updatedjob.State.GetSpecificState()))

                arc_utils.arcjob2workspec(updatedjob, workspec)
                # Have to force update to change info in DB
                workspec.force_update('workAttributes')
                tmplog.debug("batchStatus {0} -> workerStatus {1}".format(
                    arcstatus.GetGeneralState(), newstatus))
                retList.append((newstatus, ''))

        return True, retList
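The returned list pairs one (status, error message) tuple with each checked worker. A hedged sketch of how a caller might consume it (monitor is a hypothetical instance of this plugin):

# Hypothetical caller code
ok, retList = monitor.check_workers(workspec_list)
if ok:
    for workspec, (newstatus, errmsg) in zip(workspec_list, retList):
        print("worker {0}: {1} {2}".format(workspec.workerID, newstatus, errmsg))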