Exemple #1
0
    def processToResubmit(self):
        """Clean up jobs in state 'toresubmit' and put them back to 'tosubmit'.

        Jobs are looked up for this cluster when one is configured, otherwise
        the jobs with an empty clusterlist are taken. For every proxy, the
        jobs which already have a JobID are cancelled and cleaned through
        arc.JobSupervisor, then the DB record of each job in the group is
        reset to 'tosubmit' with no cluster and an empty arc.Job.
        """
        if self.cluster:
            selection = ("arcstate='toresubmit' and cluster='" +
                         self.cluster + "'")
        else:
            selection = "arcstate='toresubmit' and clusterlist=''"
        pending = self.db.getArcJobs(selection)

        for proxyid, entries in pending.items():
            self.uc.CredentialString(str(self.db.getProxy(proxyid)))

            # Only jobs with a JobID were submitted and need cleaning up
            submitted = [entry[2] for entry in entries if entry[2].JobID]

            if submitted:
                # Every job is handed to Cancel(), but the supervisor only
                # cancels the cancellable ones, so cleaning takes two calls
                # to Clean(): one for the cancelled jobs, one for the rest
                supervisor = arc.JobSupervisor(self.uc, submitted)
                supervisor.Update()
                self.log.info("Cancelling %i jobs" % len(submitted))
                supervisor.Cancel()

                cancelled_ids = supervisor.GetIDsProcessed()
                remaining_ids = supervisor.GetIDsNotProcessed()

                # First pass: the jobs which were cancelled successfully
                if cancelled_ids:
                    supervisor.SelectByID(cancelled_ids)
                    self.log.info("Cleaning %i jobs" % len(cancelled_ids))
                    cleaned = supervisor.Clean()
                    if not cleaned:
                        self.log.warning("Failed to clean some jobs")

                # Second pass: a fresh supervisor for the jobs not cancelled
                if remaining_ids:
                    leftovers = [
                        arcjob for arcjob in submitted
                        if arcjob.JobID in remaining_ids
                    ]
                    supervisor = arc.JobSupervisor(self.uc, leftovers)
                    supervisor.Update()

                    self.log.info("Cleaning %i jobs" % len(leftovers))
                    cleaned = supervisor.Clean()
                    if not cleaned:
                        self.log.warning("Failed to clean some jobs")

            # An empty job object wipes the ARC job info stored in the DB
            blank = arc.Job()
            for rowid, _appjobid, _arcjob, _created in entries:
                reset = {
                    "arcstate": "tosubmit",
                    "tarcstate": self.db.getTimeStamp(),
                    "cluster": None
                }
                self.db.updateArcJob(rowid, reset, blank)
Exemple #2
0
    def processToRerun(self):
        """Resume the jobs in state 'torerun' on this cluster.

        Nothing is done without a configured cluster, without matching jobs,
        or while the 'downtime'/'srmdown' setting equals 'True'. Otherwise the
        proxies of the jobs are renewed and the jobs resumed, per proxy. Jobs
        which could not be resumed are set to 'failed'; the others become
        'finishing' or 'submitted' depending on their RestartState.
        """
        # Rerun only applies to jobs which have been submitted
        if not self.cluster:
            return

        selection = "arcstate='torerun' and cluster='" + self.cluster + "'"
        pending = self.db.getArcJobs(selection)
        if not pending:
            return

        # TODO: downtimes from CRIC
        srm_down = self.conf.get(['downtime', 'srmdown']) == 'True'
        if srm_down:
            self.log.info('SRM down, not rerunning')
            return

        total = sum(len(group) for group in pending.values())
        self.log.info("Resuming %i jobs" % total)

        for proxyid, entries in pending.items():
            self.uc.CredentialString(str(self.db.getProxy(proxyid)))

            # Renew the proxy to be safe, then resume with a new supervisor
            supervisor = arc.JobSupervisor(self.uc,
                                           [entry[2] for entry in entries])
            supervisor.Update()
            supervisor.Renew()
            supervisor = arc.JobSupervisor(self.uc,
                                           [entry[2] for entry in entries])
            supervisor.Update()
            supervisor.Resume()

            failed_ids = supervisor.GetIDsNotProcessed()

            for rowid, appjobid, arcjob, _created in entries:
                if arcjob.JobID in failed_ids:
                    self.log.error("%s: Could not resume job %s" %
                                   (appjobid, arcjob.JobID))
                    self.db.updateArcJob(rowid, {
                        "arcstate": "failed",
                        "tarcstate": self.db.getTimeStamp()
                    })
                    continue

                if arcjob.RestartState == arc.JobState.FINISHING:
                    newstate = "finishing"
                else:
                    newstate = 'submitted'
                # The timestamp is put one hour ahead to force a wait before
                # the next status check, so that the infosys can update and
                # the failed state is not picked up again
                self.db.updateArcJob(rowid, {
                    "arcstate": newstate,
                    "tarcstate": self.db.getTimeStamp(time.time() + 3600)
                })
Exemple #3
0
def example():
    """Download the results of one hard-coded ARC job into /tmp and list them."""
    # User configuration: the proxy of the current user and the directory
    # holding the trusted CA certificates
    user_config = arc.UserConfig()
    user_config.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    user_config.CACertificatesDirectory("/etc/grid-security/certificates")

    # Describe the job by its JobID; one URL object serves as service
    # information, job status and job management URL
    job = arc.Job()
    job.JobID = "https://piff.hep.lu.se:443/arex/hYDLDmyxvUfn5h5iWqkutBwoABFKDmABFKDmIpHKDmYBFKDmtRy9En"
    job.Flavour = "ARC1"
    arex_url = arc.URL("https://piff.hep.lu.se:443/arex")
    job.ServiceInformationURL = arex_url
    job.JobStatusURL = arex_url
    job.JobManagementURL = arex_url

    sys.stdout.write("Get job information from the computing element...\n")
    # The JobSupervisor takes the job and updates its information
    supervisor = arc.JobSupervisor(user_config, [job])
    supervisor.Update()

    sys.stdout.write("Downloading results...\n")
    # Receives the directory of every downloaded job result (there could be
    # more than one job)
    result_dirs = arc.StringList()
    # Arguments of Retrieve, in order:
    #   "/tmp"      - directory to download into
    #   False       - usejobname: name the subdirectory after the job ID
    #   False       - force: do not overwrite an existing directory
    #   result_dirs - list collecting the downloaded directories
    retrieved = supervisor.Retrieve("/tmp", False, False, result_dirs)
    if not retrieved:
        sys.stdout.write("Downloading results failed.\n")

    for result_dir in result_dirs:
        sys.stdout.write("Job results were downloaded to %s\n"%str(result_dir))
        sys.stdout.write("Contents of the directory:\n")
        for entry in os.listdir(result_dir):
            sys.stdout.write("   %s\n"%entry)
Exemple #4
0
  def killJob(self, jobIDList):
    """ Cancel the specified jobs with a single JobSupervisor

        :param jobIDList: one job ID as a string, or an iterable of job ID strings
        :return: S_OK() when Cancel() succeeds, S_ERROR naming all given jobs when
                 it does not, or the result of _prepareProxy() when that failed
    """

    proxyResult = self._prepareProxy()
    if not proxyResult['OK']:
      gLogger.error('ARCComputingElement: failed to set up proxy', proxyResult['Message'])
      return proxyResult
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # A lone job ID is accepted as well as a collection of them
    if isinstance(jobIDList, basestring):
      jobList = [jobIDList]
    else:
      jobList = list(jobIDList)

    gLogger.debug("Killing jobs %s" % jobIDList)
    jobs = [self.__getARCJob(jobID) for jobID in jobList]

    # JobSupervisor aggregates the jobs into bulk operations, which keeps the communication overhead low
    supervisor = arc.JobSupervisor(self.usercfg, jobs)
    if supervisor.Cancel():
      return S_OK()

    errorString = ' - '.join(jobList).strip()
    return S_ERROR('Failed to kill at least one of these jobs: %s. CE(?) not reachable?' % errorString)
Exemple #5
0
    def processToClean(self):
        """Clean up to 100 jobs in state 'toclean' on this cluster.

        The jobs are cleaned per proxy through arc.JobSupervisor. A job which
        could not be cleaned is logged as an error; the DB record of every
        job is deleted either way.
        """
        selection = ("arcstate='toclean' and cluster='" + self.cluster +
                     "' limit 100")
        pending = self.db.getArcJobs(selection)
        if not pending:
            return

        total = sum(len(group) for group in pending.values())
        self.log.info("Cleaning %d jobs" % total)

        for proxyid, entries in pending.items():
            self.uc.CredentialString(str(self.db.getProxy(proxyid)))

            supervisor = arc.JobSupervisor(self.uc,
                                           [entry[2] for entry in entries])
            supervisor.Update()
            supervisor.Clean()

            failed_ids = supervisor.GetIDsNotProcessed()

            for rowid, appjobid, arcjob, _created in entries:
                if arcjob.JobID in failed_ids:
                    self.log.error("%s: Could not clean job %s" %
                                   (appjobid, arcjob.JobID))

                # Removed from the DB whether or not the clean succeeded
                self.db.deleteArcJob(rowid)
Exemple #6
0
  def killJob(self, jobIDList):
    """ Cancel the specified jobs, in chunks of 100 jobs per JobSupervisor

        :param jobIDList: one job ID as a string, or an iterable of job ID strings
        :return: S_OK() when every chunk is cancelled, S_ERROR naming all given jobs
                 as soon as one chunk fails, or the result of _prepareProxy() when
                 that failed
    """

    proxyResult = self._prepareProxy()
    if not proxyResult['OK']:
      self.log.error('ARCComputingElement: failed to set up proxy', proxyResult['Message'])
      return proxyResult
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # A lone job ID is accepted as well as a collection of them
    if isinstance(jobIDList, six.string_types):
      jobList = [jobIDList]
    else:
      jobList = list(jobIDList)

    self.log.debug("Killing jobs %s" % jobIDList)
    jobs = [self.__getARCJob(jobID) for jobID in jobList]

    # JobSupervisor aggregates the jobs into bulk operations, which keeps the communication overhead low.
    # Chunks are still needed to avoid a timeout when there are too many jobs to supervise
    for chunk in breakListIntoChunks(jobs, 100):
      supervisor = arc.JobSupervisor(self.usercfg, chunk)
      if supervisor.Cancel():
        continue
      errorString = ' - '.join(jobList).strip()
      return S_ERROR('Failed to kill at least one of these jobs: %s. CE(?) not reachable?' % errorString)

    return S_OK()
Exemple #7
0
def example():
    """Print a hard-coded ARC job before and after a status update.

    The job is dumped to stdout, handed to a JobSupervisor which updates its
    state, and dumped again. If the supervisor holds no job after the update,
    a message is printed instead of the second dump.
    """
    # Creating a UserConfig object with the user's proxy
    # and the path of the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Create a new job object with a given JobID
    job = arc.Job()
    job.JobID = "https://piff.hep.lu.se:443/arex/1QuMDmRwvUfn5h5iWqkutBwoABFKDmABFKDmIpHKDmXBFKDmIuAean"
    job.Flavour = "ARC1"
    job.JobManagementURL = arc.URL("https://piff.hep.lu.se:443/arex")
    job.JobStatusURL = arc.URL("https://piff.hep.lu.se:443/arex")

    sys.stdout.write("Job object before update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)

    job_supervisor = arc.JobSupervisor(uc, [job])

    # Update the states of jobs within this JobSupervisor
    job_supervisor.Update()

    # Get our updated job from the JobSupervisor
    jobs = job_supervisor.GetAllJobs()
    if not jobs:
        # Indexing an empty job list would raise IndexError
        # NOTE(review): presumably the list is empty when the supervisor
        # did not accept the job - confirm against the ARC SDK
        sys.stdout.write("No jobs found\n")
        return

    job = jobs[0]

    sys.stdout.write("Job object after update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)
Exemple #8
0
    def test_resubmit(self):
        """Run JobSupervisor.Resubmit with the TEST broker and one test target.

        Only the number of jobs held by the supervisor is asserted; the
        outcome of Resubmit itself is not checked here.
        """
        self.usercfg.Broker("TEST")

        # The TEST target retriever reports one target, queried successfully
        retriever_control = arc.TargetInformationRetrieverPluginTESTControl
        retriever_control.targets = [
            self.create_test_target("http://test2.nordugrid.org")
        ]
        retriever_control.status = arc.EndpointQueryingStatus(
            arc.EndpointQueryingStatus.SUCCESSFUL)

        failed_job = self.create_test_job(
            job_id="http://test.nordugrid.org/1234567890test1",
            state=arc.JobState.FAILED)
        running_job = self.create_test_job(
            job_id="http://test.nordugrid.org/1234567890test2",
            state=arc.JobState.RUNNING)
        supervisor = arc.JobSupervisor(self.usercfg, [failed_job, running_job])

        self.expect(supervisor.GetAllJobs()).to_have(2).jobs()

        target_endpoint = arc.Endpoint("http://test2.nordugrid.org",
                                       arc.Endpoint.COMPUTINGINFO,
                                       "org.nordugrid.tirtest")
        resubmitted = arc.JobList()
        supervisor.Resubmit(0, [target_endpoint], resubmitted)
Exemple #9
0
def example():
    """List the jobs of one computing element, print their states and dump the failed ones."""
    user_config = arc.UserConfig()

    # The supervisor is the consumer of every job the retriever finds
    supervisor = arc.JobSupervisor(user_config)

    # Ask the computing element for its list of jobs
    joblist_endpoint = arc.Endpoint("https://piff.hep.lu.se:443/arex",
                                    arc.Endpoint.JOBLIST)
    sys.stdout.write("Querying %s for jobs...\n" % joblist_endpoint.str())
    retriever = arc.JobListRetriever(user_config)
    retriever.addConsumer(supervisor)
    retriever.addEndpoint(joblist_endpoint)
    retriever.wait()

    job_count = len(supervisor.GetAllJobs())
    sys.stdout.write("%s jobs found\n" % job_count)

    sys.stdout.write("Getting job states...\n")
    # Refresh the state of every job held by the supervisor
    supervisor.Update()

    # Report the general state of each updated job
    states = [
        found.State.GetGeneralState() for found in supervisor.GetAllJobs()
    ]
    sys.stdout.write("The jobs have the following states: %s\n" %
                     ", ".join(states))

    # Narrow the selection down to the failed jobs and dump them
    supervisor.SelectByStatus(["Failed"])
    failed_jobs = supervisor.GetSelectedJobs()

    sys.stdout.write("The failed jobs:\n")
    for failed in failed_jobs:
        failed.SaveToStream(arc.CPyOstream(sys.stdout), True)
Exemple #10
0
    def get_job(self, job_id):
        """
        Look up a job on the configured ARC server by its ID

        All jobs listed by the server are retrieved and updated, then the
        first one whose ``JobID`` equals ``job_id`` is returned.

        :param job_id:            ID of the job as returned by `submit_job`
        :raises JobNotFoundError: if no job with the given ID could be found
        :return:                  Instance of ``arc.Job`` representing the job
        """
        user_config = self.get_user_config()

        # The supervisor is the consumer of every job the retriever finds
        supervisor = arc.JobSupervisor(user_config)

        joblist_endpoint = arc.Endpoint(self.config.ARC_SERVER,
                                        arc.Endpoint.JOBLIST)
        retriever = arc.JobListRetriever(user_config)
        retriever.addConsumer(supervisor)
        retriever.addEndpoint(joblist_endpoint)
        retriever.wait()

        # Refresh the job states before searching
        supervisor.Update()

        not_found = object()
        match = next((candidate for candidate in supervisor.GetAllJobs()
                      if candidate.JobID == job_id), not_found)
        if match is not_found:
            raise JobNotFoundError(
                "Could not find a job with ID '{}'".format(job_id))
        return match
Exemple #11
0
def example():
    """Print a hard-coded EMI-ES job before and after a status update."""
    # User configuration: the proxy of the current user and the directory
    # holding the trusted CA certificates
    user_config = arc.UserConfig()
    user_config.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    user_config.CACertificatesDirectory("/etc/grid-security/certificates")

    # Describe the job: its ID, the endpoint URLs and the interface names
    # used for status queries and for job management
    job = arc.Job()
    job.JobID = "https://piff.hep.lu.se:443/arex/w7LNDmSkEiun1ZPzno6AuCjpABFKDmABFKDmZ9LKDmUBFKDmXugZwm"
    job.IDFromEndpoint = "w7LNDmSkEiun1ZPzno6AuCjpABFKDmABFKDmZ9LKDmUBFKDmXugZwm"
    job.JobManagementURL = arc.URL("https://piff.hep.lu.se:443/arex")
    job.JobStatusURL = arc.URL("https://piff.hep.lu.se:443/arex")
    job.JobStatusInterfaceName = 'org.ogf.glue.emies.activitymanagement'
    job.JobManagementInterfaceName = 'org.ogf.glue.emies.activitymanagement'

    sys.stdout.write("Job object before update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)

    supervisor = arc.JobSupervisor(user_config, [job])

    # Refresh the state of the job held by the supervisor
    supervisor.Update()

    # Fetch the updated job back; stop if the supervisor holds none
    updated_jobs = supervisor.GetAllJobs()
    if updated_jobs:
        job = updated_jobs[0]
    else:
        sys.stdout.write("No jobs found\n")
        return

    sys.stdout.write("Job object after update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)
Exemple #12
0
    def fetchAll(self, jobs):
        """Retrieve the outputs of all given jobs into self.tmpdir.

        :param jobs: mapping whose values are the jobs to fetch
        :return: tuple of two lists: the IDs the JobSupervisor reports as
                 processed and as not processed after Retrieve()
        """
        # One supervisor fetches the outputs of every job
        supervisor = arc.JobSupervisor(self.uc, list(jobs.values()))
        supervisor.Update()
        downloaded_dirs = arc.StringList()
        supervisor.Retrieve(self.tmpdir, False, False, downloaded_dirs)

        fetched = list(supervisor.GetIDsProcessed())
        notfetched = list(supervisor.GetIDsNotProcessed())
        return (fetched, notfetched)
    def kill_worker(self, workspec):
        """Cancel the ARC job.

        Success is also reported when the job was never submitted, when no
        proxy is available for its role, and when the cancel failed for a job
        in a defined state. Failure is only reported for a job in UNDEFINED
        state which was submitted less than one hour ago.

        :param workspec: worker specification
        :type workspec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """

        # make logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log

        (job, _modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)
        if not job.JobID:
            # Job not submitted
            tmplog.info("Job was not submitted so cannot be cancelled")
            return True, ''

        # Set certificate
        userconfig = arc.UserConfig(self.cred_type)
        try:
            userconfig.ProxyPath(str(self.certs[proxyrole]))
        except Exception:
            # Not a bare except, so that KeyboardInterrupt and SystemExit
            # still propagate.
            # Log a warning and return True so that job can be cleaned
            tmplog.warning("Job {0}: no proxy found with role {1}".format(
                job.JobID, proxyrole))
            return True, ''

        job_supervisor = arc.JobSupervisor(userconfig, [job])
        job_supervisor.Update()
        job_supervisor.Cancel()

        notcancelled = job_supervisor.GetIDsNotProcessed()

        if job.JobID in notcancelled:
            if job.State == arc.JobState.UNDEFINED:
                # If longer than one hour since submission assume job never made it
                if job.SubmissionTime + arc.Period(3600) < arc.Time():
                    tmplog.warning(
                        "Assuming job is lost and marking as cancelled")
                    return True, ''

                # Job has not yet reached info system
                tmplog.warning(
                    "Job is not yet in info system so cannot be cancelled")
                return False, "Job is not yet in info system so could not be cancelled"

            # Log a warning and return True so that job can be cleaned
            tmplog.warning("Job could not be cancelled")
            return True, ''

        tmplog.info("Job cancelled successfully")
        return True, ''
Exemple #14
0
    def test_cancel(self):
        """Check JobSupervisor.Cancel() and the processed / not processed ID lists.

        Four scenarios are run: the test plugin's cancelStatus set to True and
        to False, each without a selection and with only the "Accepted" job
        selected.
        """
        id1 = "http://test.nordugrid.org/1234567890test1"
        id2 = "http://test.nordugrid.org/1234567890test2"
        id3 = "http://test.nordugrid.org/1234567890test3"
        id4 = "http://test.nordugrid.org/1234567890test4"
        js = arc.JobSupervisor(self.usercfg, [
            self.create_test_job(job_id=id1, state=arc.JobState.RUNNING),
            self.create_test_job(job_id=id2, state=arc.JobState.FINISHED),
            self.create_test_job(job_id=id3, state=arc.JobState.UNDEFINED)
        ])

        # No selection, cancelStatus True: only the RUNNING job is processed
        arc.JobControllerPluginTestACCControl.cancelStatus = True
        self.expect(js.Cancel()).to_be(
            True, message="Cancel was expected to return True")
        self.expect(js.GetIDsProcessed()).to_have(1).ID()
        self.expect(js.GetIDsProcessed()[0]).to_be(id1)
        self.expect(js.GetIDsNotProcessed()).to_have(2).IDs()
        self.expect(js.GetIDsNotProcessed()[0]).to_be(id2)
        self.expect(js.GetIDsNotProcessed()[1]).to_be(id3)
        js.ClearSelection()

        # No selection, cancelStatus False: no job is processed
        arc.JobControllerPluginTestACCControl.cancelStatus = False
        self.expect(js.Cancel()).to_be(
            False, message="Cancel was expected to return False")
        self.expect(js.GetIDsProcessed()).to_have(0).IDs()
        self.expect(js.GetIDsNotProcessed()).to_have(3).IDs()
        self.expect(js.GetIDsNotProcessed()[0]).to_be(id1)
        self.expect(js.GetIDsNotProcessed()[1]).to_be(id2)
        self.expect(js.GetIDsNotProcessed()[2]).to_be(id3)
        js.ClearSelection()

        job = self.create_test_job(job_id=id4,
                                   state=arc.JobState.ACCEPTED,
                                   state_text="Accepted")
        self.expect(js.AddJob(job)).to_be(
            True, message="AddJob was expected to return True")

        # "Accepted" job selected, cancelStatus True: only that job is
        # processed. The message used to claim False was expected here.
        arc.JobControllerPluginTestACCControl.cancelStatus = True
        js.SelectByStatus(["Accepted"])
        self.expect(js.Cancel()).to_be(
            True, message="Cancel was expected to return True")
        self.expect(js.GetIDsProcessed()).to_have(1).ID()
        self.expect(js.GetIDsProcessed()[0]).to_be(id4)
        self.expect(js.GetIDsNotProcessed()).to_have(0).IDs()
        js.ClearSelection()

        # "Accepted" job selected, cancelStatus False: that job is not processed
        arc.JobControllerPluginTestACCControl.cancelStatus = False
        js.SelectByStatus(["Accepted"])
        self.expect(js.Cancel()).to_be(
            False, message="Cancel was expected to return False")
        self.expect(js.GetIDsProcessed()).to_have(0).IDs()
        self.expect(js.GetIDsNotProcessed()).to_have(1).ID()
        self.expect(js.GetIDsNotProcessed()[0]).to_be(id4)
        js.ClearSelection()
Exemple #15
0
    def test_constructor(self):
        """Check that a JobSupervisor built from two jobs holds both, in order."""
        first_id = "http://test.nordugrid.org/1234567890test1"
        second_id = "http://test.nordugrid.org/1234567890test2"
        first_job = self.create_test_job(job_id=first_id)
        second_job = self.create_test_job(job_id=second_id)
        supervisor = arc.JobSupervisor(self.usercfg, [first_job, second_job])
        self.expect(supervisor.GetAllJobs()).not_to_be_empty()

        held_jobs = supervisor.GetAllJobs()
        self.expect(held_jobs).to_have(2).jobs()

        # The jobs come back in the order they were passed in
        self.expect(held_jobs[0].JobID).to_be(first_id)
        self.expect(held_jobs[1].JobID).to_be(second_id)
Exemple #16
0
    def test_clean(self):
        """Check JobSupervisor.Clean() and the processed / not processed ID lists.

        Four scenarios are run: the test plugin's cleanStatus set to True and
        to False, each without a selection and with only the "Finished" job
        selected.
        """
        finished_id = "http://test.nordugrid.org/1234567890test1"
        undefined_id = "http://test.nordugrid.org/1234567890test2"
        finished_job = self.create_test_job(job_id=finished_id,
                                            state=arc.JobState.FINISHED,
                                            state_text="Finished")
        undefined_job = self.create_test_job(job_id=undefined_id,
                                             state=arc.JobState.UNDEFINED)
        supervisor = arc.JobSupervisor(self.usercfg,
                                       [finished_job, undefined_job])
        self.expect(supervisor.GetAllJobs()).to_have(2).jobs()

        plugin_control = arc.JobControllerPluginTestACCControl

        # No selection, cleanStatus True: only the FINISHED job is processed
        plugin_control.cleanStatus = True
        self.expect(supervisor.Clean()).to_be(
            True, message="Clean was expected to return True")
        self.expect(supervisor.GetIDsProcessed()).to_have(1).ID()
        self.expect(supervisor.GetIDsProcessed()[0]).to_be(finished_id)
        self.expect(supervisor.GetIDsNotProcessed()).to_have(1).ID()
        self.expect(supervisor.GetIDsNotProcessed()[0]).to_be(undefined_id)
        supervisor.ClearSelection()

        # No selection, cleanStatus False: no job is processed
        plugin_control.cleanStatus = False
        self.expect(supervisor.Clean()).to_be(
            False, message="Clean was expected to return False")
        self.expect(supervisor.GetIDsProcessed()).to_have(0).IDs()
        self.expect(supervisor.GetIDsNotProcessed()).to_have(2).IDs()
        self.expect(supervisor.GetIDsNotProcessed()[0]).to_be(finished_id)
        self.expect(supervisor.GetIDsNotProcessed()[1]).to_be(undefined_id)
        supervisor.ClearSelection()

        # "Finished" job selected, cleanStatus True: that job is processed
        plugin_control.cleanStatus = True
        supervisor.SelectByStatus(["Finished"])
        self.expect(supervisor.Clean()).to_be(
            True, message="Clean was expected to return True")
        self.expect(supervisor.GetIDsProcessed()).to_have(1).ID()
        self.expect(supervisor.GetIDsProcessed()[0]).to_be(finished_id)
        self.expect(supervisor.GetIDsNotProcessed()).to_have(0).IDs()
        supervisor.ClearSelection()

        # "Finished" job selected, cleanStatus False: that job is not processed
        plugin_control.cleanStatus = False
        supervisor.SelectByStatus(["Finished"])
        self.expect(supervisor.Clean()).to_be(
            False, message="Clean was expected to return False")
        self.expect(supervisor.GetIDsProcessed()).to_have(0).IDs()
        self.expect(supervisor.GetIDsNotProcessed()).to_have(1).ID()
        self.expect(supervisor.GetIDsNotProcessed()[0]).to_be(finished_id)
        supervisor.ClearSelection()
Exemple #17
0
    def test_add_job(self):
        """Check that AddJob() accepts a test job and rejects it once its
        management interface name is empty or unknown."""
        supervisor = arc.JobSupervisor(self.usercfg, arc.JobList())
        self.expect(supervisor.GetAllJobs()).to_be_empty()

        candidate = self.create_test_job(
            job_id="http://test.nordugrid.org/1234567890test1")
        accepted = supervisor.AddJob(candidate)
        self.expect(accepted).to_be(
            True, message="AddJob was expected to return True")
        self.expect(supervisor.GetAllJobs()).not_to_be_empty()

        # An empty interface name: the job is refused, the count stays at one
        candidate.JobManagementInterfaceName = ""
        accepted = supervisor.AddJob(candidate)
        self.expect(accepted).to_be(
            False, message="AddJob was expected to return False")
        self.expect(supervisor.GetAllJobs()).to_have(1).job()

        # An unknown interface name: refused as well
        candidate.JobManagementInterfaceName = "non.existent.interface"
        accepted = supervisor.AddJob(candidate)
        self.expect(accepted).to_be(
            False, message="AddJob was expected to return False")
        self.expect(supervisor.GetAllJobs()).to_have(1).job()
    def sweep_worker(self, workspec):
        """Clean the ARC job

        Success is reported on every path: when the job was never submitted,
        when no proxy is available for its role, and whether or not the clean
        itself worked. Problems are only logged as warnings.

        :param workspec: worker specification
        :type workspec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """

        # make logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log

        (job, _modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)
        if not job.JobID:
            # Job not submitted
            tmplog.info("Job was not submitted so cannot be cleaned")
            return True, ''

        # Set certificate
        userconfig = arc.UserConfig(self.cred_type)
        try:
            userconfig.ProxyPath(str(self.certs[proxyrole]))
        except Exception:
            # Not a bare except, so that KeyboardInterrupt and SystemExit
            # still propagate.
            # Log a warning and return True so that job can be finished
            tmplog.warning("Job {0}: no proxy found with role {1}".format(
                job.JobID, proxyrole))
            return True, ''

        job_supervisor = arc.JobSupervisor(userconfig, [job])
        job_supervisor.Update()
        job_supervisor.Clean()

        notcleaned = job_supervisor.GetIDsNotProcessed()

        if job.JobID in notcleaned:
            # Log a warning and return True so that job can be finished
            tmplog.warning("Job could not be cleaned")
            return True, ''

        tmplog.info("Job cleaned successfully")
        return True, ''
Exemple #19
0
    def processToCancel(self):
        """Cancel the jobs in state 'tocancel' on this cluster.

        The jobs are cancelled per proxy through arc.JobSupervisor and their
        DB records updated:

        - jobs without a JobID (never submitted) become 'cancelled'
        - jobs the supervisor processed become 'cancelling'
        - jobs not processed become 'cancelled', except jobs in UNDEFINED
          state whose StartTime is less than one hour old, which are left
          untouched
        """
        jobstocancel = self.db.getArcJobs("arcstate='tocancel' and cluster='" +
                                          self.cluster + "'")
        if not jobstocancel:
            return

        self.log.info("Cancelling %i jobs" %
                      sum(len(v) for v in jobstocancel.values()))
        for proxyid, jobs in jobstocancel.items():
            # The proxy is converted with str(), as the sibling processTo*
            # methods do before handing it to CredentialString
            self.uc.CredentialString(str(self.db.getProxy(proxyid)))

            job_supervisor = arc.JobSupervisor(self.uc, [j[2] for j in jobs])
            job_supervisor.Update()
            job_supervisor.Cancel()

            notcancelled = job_supervisor.GetIDsNotProcessed()

            for (rowid, appjobid, job, _created) in jobs:

                if not job.JobID:
                    # Job not submitted
                    self.log.info("%s: Marking unsubmitted job cancelled" %
                                  appjobid)
                    self.db.updateArcJob(
                        rowid, {
                            "arcstate": "cancelled",
                            "tarcstate": self.db.getTimeStamp()
                        })

                elif job.JobID in notcancelled:
                    if job.State == arc.JobState.UNDEFINED:
                        # If longer than one hour since submission assume job never made it
                        # NOTE(review): the check reads StartTime, not
                        # SubmissionTime - confirm which one the DB stores
                        if job.StartTime + arc.Period(3600) < arc.Time():
                            self.log.warning(
                                "%s: Assuming job %s is lost and marking as cancelled"
                                % (appjobid, job.JobID))
                            self.db.updateArcJob(
                                rowid, {
                                    "arcstate": "cancelled",
                                    "tarcstate": self.db.getTimeStamp()
                                })
                        else:
                            # Job has not yet reached info system; the DB
                            # record is left as it is
                            self.log.warning(
                                "%s: Job %s is not yet in info system so cannot be cancelled"
                                % (appjobid, job.JobID))
                    else:
                        self.log.error("%s: Could not cancel job %s" %
                                       (appjobid, job.JobID))
                        # Just to mark as cancelled so it can be cleaned
                        self.db.updateArcJob(
                            rowid, {
                                "arcstate": "cancelled",
                                "tarcstate": self.db.getTimeStamp()
                            })
                else:
                    self.db.updateArcJob(
                        rowid, {
                            "arcstate": "cancelling",
                            "tarcstate": self.db.getTimeStamp()
                        })
Exemple #20
0
  def getJobStatus(self, jobIDList):
    """ Return the DIRAC status of each of the given jobs

        The ARC state of every job is mapped through self.mapStates; a job without
        a general state is reported as 'Unknown' and a "Done" job with a non-zero
        exit code as "Failed". As a side effect, "Running"/"Queuing" jobs whose proxy
        expires within 10000 seconds get their proxy renewed and "Hold" jobs are
        cancelled.

        :param jobIDList: one job reference as a string, or an iterable of them;
                          a DIRAC stamp (":::XXXXX") suffix is stripped
        :return: S_OK(dict of job ID -> status), S_ERROR when no status was obtained,
                 or the result of _prepareProxy() when that failed
    """

    proxyResult = self._prepareProxy()
    if not proxyResult['OK']:
      self.log.error('ARCComputingElement: failed to set up proxy', proxyResult['Message'])
      return proxyResult
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # A lone job reference is accepted as well as a collection of them
    if isinstance(jobIDList, six.string_types):
      pilotRefs = [jobIDList]
    else:
      pilotRefs = list(jobIDList)

    # Pilots are stored with a DIRAC stamp (":::XXXXX") appended; only the part before it is the job ID
    jobList = [ref.split(":::")[0] if ":::" in ref else ref for ref in pilotRefs]
    jobs = [self.__getARCJob(jobID) for jobID in jobList]

    # JobSupervisor aggregates the jobs into bulk operations, which keeps the communication overhead low.
    # Chunks are still needed to avoid a timeout when there are too many jobs to supervise
    jobsUpdated = []
    for chunk in breakListIntoChunks(jobs, 100):
      updater = arc.JobSupervisor(self.usercfg, chunk)
      updater.Update()
      jobsUpdated.extend(updater.GetAllJobs())

    resultDict = {}
    jobsToRenew = []
    jobsToCancel = []
    for job in jobsUpdated:
      jobID = job.JobID
      self.log.debug("Retrieving status for job %s" % jobID)
      arcState = job.State.GetGeneralState()
      self.log.debug("ARC status for job %s is %s" % (jobID, arcState))
      if arcState:
        diracStatus = self.mapStates[arcState]
        # Only running or queuing jobs get their proxy renewed
        if arcState in ("Running", "Queuing"):
          nearExpiry = arc.Time() + arc.Period(10000)  # 2 hours, 46 minutes and 40 seconds
          if job.ProxyExpirationTime < nearExpiry:
            # Collected here, renewed in bulk below
            jobsToRenew.append(job)
            self.log.debug("Renewing proxy for job %s whose proxy expires at %s" % (jobID, job.ProxyExpirationTime))
        if arcState == "Hold":
          # Collected here, cancelled in bulk below, so that held jobs
          # don't sit in the queue forever
          jobsToCancel.append(job)
          self.log.debug("Killing held job %s" % jobID)
      else:
        # An empty general state means ARC reported nothing usable
        diracStatus = 'Unknown'
      # A "Done" job only counts as done when its exit code is zero
      if diracStatus == "Done" and int(job.ExitCode):
        diracStatus = "Failed"
      resultDict[jobID] = diracStatus
      self.log.debug("DIRAC status for job %s is %s" % (jobID, diracStatus))

    # Bulk renewal and cancellation, chunked for the same reason as the update above
    for chunk in breakListIntoChunks(jobsToRenew, 100):
      renewer = arc.JobSupervisor(self.usercfg, chunk)
      if not renewer.Renew():
        self.log.warn('At least one of the jobs failed to renew its credentials')

    for chunk in breakListIntoChunks(jobsToCancel, 100):
      canceller = arc.JobSupervisor(self.usercfg, chunk)
      if not canceller.Cancel():
        self.log.warn('At least one of the jobs failed to be cancelled')

    if resultDict:
      return S_OK(resultDict)

    return S_ERROR('No job statuses returned')
Exemple #21
0
    def check_workers(self, workspec_list):
        """Query ARC for every worker and translate its job state to a WorkSpec status.

        For each worker the stored ARC job is rebuilt with
        ``arc_utils.workspec2arcjob``, refreshed through an
        ``arc.JobSupervisor`` and its state mapped to one of the
        ``WorkSpec.ST_*`` values. When the general ARC state changed, the
        refreshed job is written back into the workspec and a DB update of
        ``workAttributes`` is forced.

        :param workspec_list: iterable of worker specs to check
        :return: tuple ``(True, retList)`` where ``retList`` holds one
                 ``(status, '')`` pair per worker; the second element
                 (diagnostic message) is always an empty string here
        """
        retList = []
        for workspec in workspec_list:

            # make logger
            arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
            tmplog = arclog.log
            tmplog.info("checking worker id {0}".format(workspec.workerID))
            (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)

            # Set certificate
            userconfig = arc.UserConfig(self.cred_type)
            try:
                userconfig.ProxyPath(str(self.certs[proxyrole]))
            except Exception:
                # Best effort: without a proxy the job cannot be queried, so
                # keep the current status. Only Exception is caught so that
                # KeyboardInterrupt/SystemExit still propagate.
                tmplog.error("Job {0}: no proxy found with role {1}".format(
                    job.JobID, proxyrole))
                retList.append((workspec.status, ''))
                continue

            job_supervisor = arc.JobSupervisor(userconfig, [job])
            job_supervisor.Update()

            jobsupdated = job_supervisor.GetAllJobs()
            jobsnotupdated = job_supervisor.GetIDsNotProcessed()

            # Remember how many results exist so we can detect the case where
            # the supervisor hands back no job at all for this worker
            nresults = len(retList)

            for updatedjob in jobsupdated:
                if updatedjob.JobID in jobsnotupdated:
                    tmplog.error("Failed to find information on {0}".format(
                        updatedjob.JobID))
                    # If missing for too long (2 days), mark as lost
                    if arc.Time() - modtime > arc.Period(172800):
                        tmplog.error(
                            "Job {0} missing for more than 2 days, marking as lost"
                            .format(updatedjob.JobID))
                        retList.append((workspec.ST_failed, ''))
                    else:
                        retList.append((workspec.status, ''))
                    continue

                # Convert arc state to WorkSpec state
                arcstatus = updatedjob.State
                newstatus = WorkSpec.ST_submitted
                if arcstatus == arc.JobState.RUNNING or \
                   arcstatus == arc.JobState.FINISHING:
                    newstatus = WorkSpec.ST_running
                elif arcstatus == arc.JobState.FINISHED:
                    if updatedjob.ExitCode == -1:
                        # Missing exit code, but assume success
                        tmplog.warning(
                            "Job {0} FINISHED but has missing exit code, setting to zero"
                            .format(updatedjob.JobID))
                        updatedjob.ExitCode = 0
                    newstatus = WorkSpec.ST_finished
                elif arcstatus == arc.JobState.FAILED:
                    newstatus = WorkSpec.ST_failed
                    tmplog.info("Job {0} failed: {1}".format(
                        updatedjob.JobID, ";".join(updatedjob.Error)))
                elif arcstatus == arc.JobState.KILLED:
                    newstatus = WorkSpec.ST_cancelled
                elif arcstatus == arc.JobState.DELETED or \
                     arcstatus == arc.JobState.OTHER:
                    # unexpected
                    newstatus = WorkSpec.ST_failed
                # Not covered: arc.JobState.HOLD. Maybe need a post-run state in
                # harvester, also to cover FINISHING

                # compare strings here to get around limitations of JobState API
                if job.State.GetGeneralState(
                ) == updatedjob.State.GetGeneralState():
                    tmplog.debug("Job {0} still in state {1}".format(
                        job.JobID, job.State.GetGeneralState()))
                    retList.append((newstatus, ''))
                    continue

                tmplog.info("Job {0}: {1} -> {2} ({3})".format(
                    job.JobID, job.State.GetGeneralState(),
                    updatedjob.State.GetGeneralState(),
                    updatedjob.State.GetSpecificState()))

                arc_utils.arcjob2workspec(updatedjob, workspec)
                # Have to force update to change info in DB
                workspec.force_update('workAttributes')
                tmplog.debug("batchStatus {0} -> workerStatus {1}".format(
                    arcstatus.GetGeneralState(), newstatus))
                retList.append((newstatus, ''))

            if len(retList) == nresults:
                # The supervisor returned no job for this worker. Keep the
                # current status so the result list stays aligned with
                # workspec_list instead of silently dropping the worker.
                tmplog.error("No job information returned for job {0}".format(
                    job.JobID))
                retList.append((workspec.status, ''))

        return True, retList
Exemple #22
0
    def checkJobs(self):
        '''
        Query all running jobs

        Does nothing if fewer than jobs.checkmintime seconds have passed since
        the previous check. Otherwise selects the jobs of this cluster that
        are in an active arcstate and that the tarcstate/checkinterval filter
        built by self.db.timeStampLessThan lets through, refreshes them
        through an arc.JobSupervisor (one per proxy) and writes the outcome
        back to the DB:

        - jobs ARC could not report on, or returned out of order, are logged
          and skipped
        - jobs whose general state is unchanged only get a fresh tarcstate
          (except on the clusters listed below, which are always rewritten)
        - all other jobs get their new arcstate, sanitised walltime/CPU time
          and the whole updated Job object stored

        Returns None.
        '''

        # minimum time between checks
        mininterval = int(self.conf.get(['jobs', 'checkmintime']))
        if time.time() < self.checktime + mininterval:
            self.log.debug("mininterval not reached")
            return
        self.checktime = time.time()

        # check jobs which were last checked more than checkinterval ago
        # NOTE(review): the select is built by string concatenation with
        # self.cluster; this assumes the cluster name comes from trusted
        # configuration - confirm.
        select = "arcstate in ('submitted', 'running', 'finishing', 'cancelling', 'holding') and " \
                 "jobid not like '' and cluster='" + self.cluster + "' and " + \
                 self.db.timeStampLessThan("tarcstate", self.conf.get(['jobs', 'checkinterval'])) + \
                 " limit 100000"
        jobstocheck = self.db.getArcJobs(select)

        njobstocheck = sum(len(v) for v in jobstocheck.values())
        if not njobstocheck:
            return
        self.log.info("%d jobs to check" % njobstocheck)
        self.resetJobs(jobstocheck)

        # Clusters whose job record is rewritten on every check even when the
        # state did not change
        # Update numbers every time for superMUC since walltime is missing for finished jobs
        alwaysupdate = (
            'gsiftp://gar-ex-etpgrid1.garching.physik.uni-muenchen.de:2811/preempt',
            'gsiftp://arc1-it4i.farm.particle.cz/qfree',
            'gsiftp://arc2-it4i.farm.particle.cz/qfree',
        )

        # Loop over proxies
        for proxyid, jobs in jobstocheck.items():
            self.uc.CredentialString(str(self.db.getProxy(proxyid)))

            job_supervisor = arc.JobSupervisor(self.uc, [j[2] for j in jobs])
            job_supervisor.Update()
            jobsupdated = job_supervisor.GetAllJobs()
            jobsnotupdated = job_supervisor.GetIDsNotProcessed()

            # dbid (not "id") so the builtin id() is not shadowed
            for (dbid, appjobid, originaljob, created), updatedjob in zip(jobs, jobsupdated):
                if updatedjob.JobID in jobsnotupdated:
                    self.log.error("%s: Failed to find information on %s" %
                                   (appjobid, updatedjob.JobID))
                    continue
                if updatedjob.JobID != originaljob.JobID:
                    # something went wrong with list order
                    self.log.warning(
                        "%s: Bad job id (%s), expected %s" %
                        (appjobid, updatedjob.JobID, originaljob.JobID))
                    continue
                # compare strings here to get around limitations of JobState API
                # map INLRMS:S and O to HOLD (not necessary when ARC 4.1 is used)
                if updatedjob.State.GetGeneralState() == 'Queuing' and (
                        updatedjob.State.GetSpecificState() == 'INLRMS:S'
                        or updatedjob.State.GetSpecificState() == 'INLRMS:O'):
                    updatedjob.State = arc.JobState('Hold')
                if originaljob.State.GetGeneralState() == updatedjob.State.GetGeneralState() \
                     and self.cluster not in alwaysupdate:
                    # just update timestamp
                    self.db.updateArcJob(dbid,
                                         {'tarcstate': self.db.getTimeStamp()})
                    continue

                self.log.info("%s: Job %s: %s -> %s (%s)" %
                              (appjobid, originaljob.JobID,
                               originaljob.State.GetGeneralState(),
                               updatedjob.State.GetGeneralState(),
                               updatedjob.State.GetSpecificState()))

                # state changed, update whole Job object
                arcstate = 'submitted'
                if updatedjob.State == arc.JobState.FINISHED:
                    if updatedjob.ExitCode == -1:
                        # Missing exit code, but assume success
                        self.log.warning(
                            "%s: Job %s FINISHED but has missing exit code, setting to zero"
                            % (appjobid, updatedjob.JobID))
                        updatedjob.ExitCode = 0
                    arcstate = 'finished'
                    self.log.debug(
                        '%s: reported walltime %d, cputime %d' %
                        (appjobid, updatedjob.UsedTotalWallTime.GetPeriod(),
                         updatedjob.UsedTotalCPUTime.GetPeriod()))
                elif updatedjob.State == arc.JobState.FAILED:
                    # EMI-ES reports cancelled jobs as failed so check substate (this is fixed in ARC 6.8)
                    if 'cancel' in updatedjob.State.GetSpecificState():
                        arcstate = 'cancelled'
                    else:
                        arcstate = self.processJobErrors(
                            dbid, appjobid, updatedjob)
                elif updatedjob.State == arc.JobState.KILLED:
                    arcstate = 'cancelled'
                elif updatedjob.State == arc.JobState.RUNNING:
                    arcstate = 'running'
                elif updatedjob.State == arc.JobState.FINISHING:
                    arcstate = 'finishing'
                elif updatedjob.State == arc.JobState.HOLD:
                    arcstate = 'holding'
                elif updatedjob.State == arc.JobState.DELETED or \
                     updatedjob.State == arc.JobState.OTHER:
                    # unexpected
                    arcstate = 'failed'

                # Walltime reported by ARC 6 is multiplied by cores
                if arc.ARC_VERSION_MAJOR >= 6 and updatedjob.RequestedSlots > 0:
                    updatedjob.UsedTotalWallTime = arc.Period(
                        updatedjob.UsedTotalWallTime.GetPeriod() //
                        updatedjob.RequestedSlots)
                # Fix crazy wallclock and CPU times
                # A job cannot have run for longer than it has existed. The
                # creation time is converted with time.mktime() (local time)
                # rather than the non-portable strftime("%s"), and the elapsed
                # period is computed once so the value compared against is the
                # value stored.
                elapsed = arc.Time() - arc.Time(
                    int(time.mktime(created.timetuple())))
                if updatedjob.UsedTotalWallTime > elapsed:
                    self.log.warning(
                        "%s: Fixing reported walltime %d to %d" %
                        (appjobid, updatedjob.UsedTotalWallTime.GetPeriod(),
                         elapsed.GetPeriod()))
                    updatedjob.UsedTotalWallTime = elapsed
                if updatedjob.UsedTotalCPUTime > arc.Period(10**7):
                    self.log.warning(
                        "%s: Discarding reported CPUtime %d" %
                        (appjobid, updatedjob.UsedTotalCPUTime.GetPeriod()))
                    updatedjob.UsedTotalCPUTime = arc.Period(-1)
                self.db.updateArcJob(
                    dbid, {
                        'arcstate': arcstate,
                        'tarcstate': self.db.getTimeStamp(),
                        'tstate': self.db.getTimeStamp()
                    }, updatedjob)

        self.log.info('Done')