def checkDiracProxy():
    """Synchronise the DIRAC monitoring flag with the grid-proxy state.

    Probes the proxy (without renewing and without raising) and updates
    ``DiracBase.dirac_monitoring_is_active`` accordingly.

    Returns:
        bool: the updated value of ``DiracBase.dirac_monitoring_is_active``.
    """
    # make sure proxy is valid
    proxy_ok = _proxyValid(shouldRenew=False, shouldRaise=False)
    if proxy_ok:
        DiracBase.dirac_monitoring_is_active = True
    else:
        # Warn only on the active -> inactive transition so the message
        # is not repeated on every monitoring cycle.
        if DiracBase.dirac_monitoring_is_active is True:
            logger.warning('DIRAC monitoring inactive (no valid proxy found).')
            logger.warning('Type: \'gridProxy.renew()\' to (re-)activate')
        DiracBase.dirac_monitoring_is_active = False
    return DiracBase.dirac_monitoring_is_active
def updateMonitoringInformation(_jobs):
    """Check the status of jobs and retrieve output sandboxes.

    Args:
        _jobs: iterable of (possibly proxied) Ganga job objects currently in
            'submitted'/'running'; each is stripped to its implementation
            object before processing.

    Queries DIRAC for the backend status of every monitorable job, mirrors
    that status onto the Ganga job, and hands completed/failed jobs to the
    finalisation thread pool. Returns None.
    """
    # Only those jobs in 'submitted','running' are passed in here for checking
    # if however they have already completed in Dirac they may have been put on queue
    # for processing from last time. These should be put back on queue without
    # querying dirac again. Their signature is status = running and job.backend.status
    # already set to Done or Failed etc.
    jobs = [stripProxy(j) for j in _jobs]

    logger = getLogger()

    # make sure proxy is valid
    if not _proxyValid():
        if DiracBase.dirac_monitoring_is_active:
            logger.warning('DIRAC monitoring inactive (no valid proxy found).')
            DiracBase.dirac_monitoring_is_active = False
        return
    else:
        DiracBase.dirac_monitoring_is_active = True

    # remove from consideration any jobs already in the queue. Checking this non persisted attribute
    # is better than querying the queue as cant tell if a job has just been taken off queue and is being processed
    # also by not being persistent, this attribute automatically allows queued jobs from last session to be considered
    # for requeing
    interesting_jobs = [j for j in jobs if not j.been_queued]

    # status that correspond to a ganga 'completed' or 'failed' (see DiracCommands.status(id))
    # if backend status is these then the job should be on the queue
    requeue_dirac_status = {'Completed': 'completed',
                            'Done': 'completed',
                            'Failed': 'failed',
                            'Deleted': 'failed',
                            'Unknown: No status for Job': 'failed'}

    monitor_jobs = [j for j in interesting_jobs
                    if j.backend.status not in requeue_dirac_status]
    requeue_jobs = [j for j in interesting_jobs
                    if j.backend.status in requeue_dirac_status]

    logger.debug('Interesting jobs: ' + repr([j.fqid for j in interesting_jobs]))
    logger.debug('Monitor jobs : ' + repr([j.fqid for j in monitor_jobs]))
    logger.debug('Requeue jobs : ' + repr([j.fqid for j in requeue_jobs]))

    from Ganga.GPI import queues

    # requeue existing completed job
    for j in requeue_jobs:
        # if j.backend.status in requeue_dirac_status:
        queues._monitoring_threadpool.add_function(DiracBase.job_finalisation,
                                                   args=(j, requeue_dirac_status[j.backend.status]),
                                                   priority=5,
                                                   name="Job %s Finalizing" % j.fqid)
        j.been_queued = True

    # now that can submit in non_blocking mode, can see jobs in submitting
    # that have yet to be assigned an id so ignore them
    # NOT SURE THIS IS VALID NOW BULK SUBMISSION IS GONE
    # EVEN THOUGH COULD ADD queues.add(j.submit) WILL KEEP AN EYE ON IT
    # dirac_job_ids = [ j.backend.id for j in monitor_jobs if j.backend.id is not None ]
    # Correction this did become a problem for a crashed session during
    # submit, see #104454
    dead_jobs = (j for j in monitor_jobs if j.backend.id is None)
    for d in dead_jobs:
        d.updateStatus('failed')
        if d.master is not None:
            d.master.updateMasterJobStatus()

    ganga_job_status = [j.status for j in monitor_jobs if j.backend.id is not None]
    dirac_job_ids = [j.backend.id for j in monitor_jobs if j.backend.id is not None]

    # logger.debug("GangaStatus: %s" % str(ganga_job_status))
    # logger.debug("diracJobIDs: %s" % str(dirac_job_ids))

    result = execute('status(%s)' % str(dirac_job_ids))
    if len(result) != len(ganga_job_status):
        # FIX: message previously read 'failed fro %s'
        logger.warning('Dirac monitoring failed for %s, result = %s' % (
            str(dirac_job_ids), str(result)))
        return

    # logger.debug("%s, %s, %s" % (str(len(ganga_job_status)), str(len(dirac_job_ids)), str(len(result))))

    from Ganga.Core import monitoring_component

    thread_handled_states = ['completed', 'failed']
    for job, state, old_state in zip(monitor_jobs, result, ganga_job_status):
        # stop promptly if the monitoring component has been asked to shut down
        if monitoring_component and monitoring_component.should_stop():
            break

        job.backend.statusInfo = state[0]
        job.backend.status = state[1]
        job.backend.actualCE = state[2]
        updated_dirac_status = state[3]
        try:
            job.backend.extraInfo = state[4]
        except Exception as err:
            # extra info is optional in the status vector; best-effort only
            # FIX: message previously read 'gxception', and a redundant 'pass' followed
            logger.debug("exception: %s" % str(err))

        logger.debug('Job status vector : ' + job.fqid + ' : ' + repr(state))

        # Is this really catching a real problem?
        if job.status != old_state:
            logger.warning('User changed Ganga job status from %s -> %s' % (str(old_state), job.status))
            continue
        ####################

        if updated_dirac_status == job.status:
            continue

        if updated_dirac_status in thread_handled_states:
            if job.status != 'running':
                DiracBase._getStateTime(job, 'running')
                if job.status in ['removed', 'killed']:
                    continue
                if (job.master and job.master.status in ['removed', 'killed']):
                    continue  # user changed it under us
                job.updateStatus('running')
                if job.master:
                    job.master.updateMasterJobStatus()
            # hand off completion/failure processing to the finalisation pool
            queues._monitoring_threadpool.add_function(DiracBase.job_finalisation,
                                                       args=(job, updated_dirac_status),
                                                       priority=5,
                                                       name="Job %s Finalizing" % job.fqid)
            job.been_queued = True
        else:
            DiracBase._getStateTime(job, updated_dirac_status)
            if job.status in ['removed', 'killed']:
                continue
            if (job.master and job.master.status in ['removed', 'killed']):
                continue  # user changed it under us
            job.updateStatus(updated_dirac_status)
            if job.master:
                job.master.updateMasterJobStatus()