Example #1
0
 def checkDiracProxy():
     """Refresh and return the DIRAC monitoring flag based on proxy validity.

     Calls ``_proxyValid`` with ``shouldRenew`` and ``shouldRaise`` both
     switched off and stores the outcome on
     ``DiracBase.dirac_monitoring_is_active``. The two warnings are emitted
     only when the flag was ``True`` before this call, so they are not
     repeated on subsequent calls while the proxy stays invalid.

     Returns:
         The updated value of ``DiracBase.dirac_monitoring_is_active``.
     """
     proxy_ok = _proxyValid(shouldRenew=False, shouldRaise=False)

     if proxy_ok:
         DiracBase.dirac_monitoring_is_active = True
         return DiracBase.dirac_monitoring_is_active

     # Proxy is not valid: warn on the active -> inactive transition only.
     was_active = DiracBase.dirac_monitoring_is_active is True
     if was_active:
         logger.warning('DIRAC monitoring inactive (no valid proxy found).')
         logger.warning('Type: \'gridProxy.renew()\' to (re-)activate')

     DiracBase.dirac_monitoring_is_active = False
     return DiracBase.dirac_monitoring_is_active
Example #2
0
 def checkDiracProxy():
     """Refresh and return the DIRAC monitoring flag based on proxy validity.

     Calls ``_proxyValid`` with ``shouldRenew`` and ``shouldRaise`` both
     switched off and stores the outcome on
     ``DiracBase.dirac_monitoring_is_active``. The two warnings are emitted
     only when the flag was ``True`` before this call, so they are not
     repeated on subsequent calls while the proxy stays invalid.

     Returns:
         The updated value of ``DiracBase.dirac_monitoring_is_active``.
     """
     proxy_ok = _proxyValid(shouldRenew=False, shouldRaise=False)

     if proxy_ok:
         DiracBase.dirac_monitoring_is_active = True
         return DiracBase.dirac_monitoring_is_active

     # Proxy is not valid: warn on the active -> inactive transition only.
     was_active = DiracBase.dirac_monitoring_is_active is True
     if was_active:
         logger.warning('DIRAC monitoring inactive (no valid proxy found).')
         logger.warning('Type: \'gridProxy.renew()\' to (re-)activate')

     DiracBase.dirac_monitoring_is_active = False
     return DiracBase.dirac_monitoring_is_active
Example #3
0
    def updateMonitoringInformation(_jobs):
        """Check the status of jobs in DIRAC and queue finished ones for finalisation.

        Only jobs in 'submitted' or 'running' are passed in here for checking.
        If they have already completed in DIRAC they may have been put on the
        queue for processing last time; such jobs (ganga status still
        'running' but ``job.backend.status`` already Done, Failed, ...) are
        put back on the queue without querying DIRAC again.

        Steps, in order:
          1. Return immediately when ``_proxyValid()`` is false, clearing
             ``DiracBase.dirac_monitoring_is_active`` (with a warning if it
             was set).
          2. Ignore jobs whose ``been_queued`` attribute is already set.
          3. Re-queue ``DiracBase.job_finalisation`` for jobs whose backend
             status is already a finished DIRAC status.
          4. Mark jobs that have no ``backend.id`` as 'failed'.
          5. Query the remaining job ids with a single ``execute('status(...)')``
             call and apply each returned status entry to its job.

        Each status entry is indexed as: [0] statusInfo, [1] DIRAC status,
        [2] actualCE, [3] the ganga status to move to, and optionally
        [4] extraInfo.

        Args:
            _jobs: iterable of jobs; each element is passed through
                ``stripProxy`` before use.

        Returns:
            None.
        """
        jobs = [stripProxy(j) for j in _jobs]

        logger = getLogger()

        # make sure proxy is valid
        if not _proxyValid():
            if DiracBase.dirac_monitoring_is_active:
                logger.warning('DIRAC monitoring inactive (no valid proxy found).')
                DiracBase.dirac_monitoring_is_active = False
            return
        DiracBase.dirac_monitoring_is_active = True

        # remove from consideration any jobs already in the queue. Checking this non persisted attribute
        # is better than querying the queue as cant tell if a job has just been taken off queue and is being processed
        # also by not being persistent, this attribute automatically allows queued jobs from last session to be considered
        # for requeing
        interesting_jobs = [j for j in jobs if not j.been_queued]
        # status that correspond to a ganga 'completed' or 'failed' (see DiracCommands.status(id))
        # if backend status is these then the job should be on the queue
        requeue_dirac_status = {'Completed': 'completed',
                                'Done': 'completed',
                                'Failed': 'failed',
                                'Deleted': 'failed',
                                'Unknown: No status for Job': 'failed'}

        monitor_jobs = [j for j in interesting_jobs if j.backend.status not in requeue_dirac_status]
        requeue_jobs = [j for j in interesting_jobs if j.backend.status in requeue_dirac_status]

        logger.debug('Interesting jobs: ' + repr([j.fqid for j in interesting_jobs]))
        logger.debug('Monitor jobs    : ' + repr([j.fqid for j in monitor_jobs]))
        logger.debug('Requeue jobs    : ' + repr([j.fqid for j in requeue_jobs]))

        # Imported here rather than at module level, as in the original code.
        from Ganga.GPI import queues

        # requeue existing completed job
        for j in requeue_jobs:
            queues._monitoring_threadpool.add_function(DiracBase.job_finalisation,
                                                       args=(j, requeue_dirac_status[j.backend.status]),
                                                       priority=5, name="Job %s Finalizing" % j.fqid)
            j.been_queued = True

        # Jobs that were never assigned a DIRAC id (e.g. a session that crashed
        # during submit, see #104454) cannot be monitored: fail them here.
        for d in (j for j in monitor_jobs if j.backend.id is None):
            d.updateStatus('failed')
            if d.master is not None:
                d.master.updateMasterJobStatus()

        # Drop the id-less jobs from the list that is zipped with the query
        # result below. zip() pairs positionally, so the job list must be
        # filtered exactly like the id and status lists; otherwise a single
        # id-less job shifts every later result onto the wrong job.
        monitor_jobs = [j for j in monitor_jobs if j.backend.id is not None]
        ganga_job_status = [j.status for j in monitor_jobs]
        dirac_job_ids = [j.backend.id for j in monitor_jobs]

        result = execute('status(%s)' % str(dirac_job_ids))

        # One status entry is expected per queried id; anything else means the
        # query failed and the entries cannot be matched to jobs.
        if len(result) != len(ganga_job_status):
            logger.warning('Dirac monitoring failed for %s, result = %s' % (
                str(dirac_job_ids), str(result)))
            return

        from Ganga.Core import monitoring_component

        thread_handled_states = ['completed', 'failed']
        user_terminated_states = ['removed', 'killed']
        for job, state, old_state in zip(monitor_jobs, result, ganga_job_status):
            # Stop promptly if the monitoring component asks for it.
            if monitoring_component and monitoring_component.should_stop():
                break

            job.backend.statusInfo = state[0]
            job.backend.status = state[1]
            job.backend.actualCE = state[2]
            updated_dirac_status = state[3]
            # extraInfo is best-effort: a failure to read or store it must not
            # stop the status update.
            try:
                job.backend.extraInfo = state[4]
            except Exception as err:
                logger.debug("Exception: %s" % str(err))
            logger.debug('Job status vector  : ' + job.fqid + ' : ' + repr(state))

            # Is this really catching a real problem?
            # The ganga status changed between taking the snapshot and now.
            if job.status != old_state:
                logger.warning('User changed Ganga job status from %s -> %s' % (str(old_state), job.status))
                continue

            if updated_dirac_status == job.status:
                continue

            if updated_dirac_status in thread_handled_states:
                # Finished jobs go through 'running' first and are then handed
                # to the thread pool for finalisation.
                if job.status != 'running':
                    DiracBase._getStateTime(job, 'running')
                    if job.status in user_terminated_states:
                        continue
                    if job.master and job.master.status in user_terminated_states:
                        continue  # user changed it under us
                    job.updateStatus('running')
                    if job.master:
                        job.master.updateMasterJobStatus()

                queues._monitoring_threadpool.add_function(DiracBase.job_finalisation,
                                                           args=(job, updated_dirac_status),
                                                           priority=5, name="Job %s Finalizing" % job.fqid)
                job.been_queued = True

            else:
                DiracBase._getStateTime(job, updated_dirac_status)
                if job.status in user_terminated_states:
                    continue
                if job.master and job.master.status in user_terminated_states:
                    continue  # user changed it under us
                job.updateStatus(updated_dirac_status)
                if job.master:
                    job.master.updateMasterJobStatus()