Beispiel #1
0
    def requeue_dirac_finished_jobs(requeue_jobs, finalised_statuses):
        """
        Push jobs which DIRAC reports as finalised (finished/failed/etc.)
        back onto the finalisation machinery.
        Args:
            requeue_jobs (list): This is a list of the jobs which are to be requeued to be finalised
            finalised_statuses (dict): Dict of the Dirac statuses vs the Ganga statuses after running
        """

        # requeue existing completed job
        for job in requeue_jobs:
            # Skip anything already handed over to the queue system.
            if job.been_queued:
                continue

            # Stop early if the monitoring loop is shutting down.
            if monitoring_component and monitoring_component.should_stop():
                break

            ganga_status = finalised_statuses[job.backend.status]
            if configDirac['serializeBackend']:
                # Serialized mode: finalise inline on this thread.
                DiracBase.job_finalisation(job, ganga_status)
            else:
                # Hand finalisation off to the monitoring thread pool.
                getQueues()._monitoring_threadpool.add_function(
                    DiracBase.job_finalisation,
                    args=(job, ganga_status),
                    priority=5,
                    name="Job %s Finalizing" % job.fqid)
                job.been_queued = True
Beispiel #2
0
    def requeue_dirac_finished_jobs(requeue_jobs, finalised_statuses):
        """
        Push jobs which DIRAC reports as finalised (finished/failed/etc.)
        back onto the finalisation machinery.
        Args:
            requeue_jobs (list): This is a list of the jobs which are to be requeued to be finalised
            finalised_statuses (dict): Dict of the Dirac statuses vs the Ganga statuses after running
        """

        from Ganga.Core import monitoring_component

        # requeue existing completed job
        for job in requeue_jobs:
            # Skip anything already handed over to the queue system.
            if job.been_queued:
                continue

            # Stop early if the monitoring loop is shutting down.
            if monitoring_component and monitoring_component.should_stop():
                break

            ganga_status = finalised_statuses[job.backend.status]
            if configDirac['serializeBackend']:
                # Serialized mode: finalise inline on this thread.
                DiracBase.job_finalisation(job, ganga_status)
            else:
                # Hand finalisation off to the monitoring thread pool.
                getQueues()._monitoring_threadpool.add_function(
                    DiracBase.job_finalisation,
                    args=(job, ganga_status),
                    priority=5,
                    name="Job %s Finalizing" % job.fqid)
                job.been_queued = True
Beispiel #3
0
    def updateMonitoringInformation(jobs):
        """
        Check the status of jobs against the DIRAC monitoring server and
        retrieve output sandboxes for completed jobs.
        Args:
            jobs (list): Ganga jobs whose backend status should be refreshed
        """
        from Ganga.Core import monitoring_component
        dirac_job_ids = [j.backend.id for j in jobs]
        global dirac_monitoring_server
        global dirac_monitoring_is_active
        # Without a valid proxy we cannot talk to DIRAC at all; warn once.
        if not dirac_monitoring_server.proxy.isValid():
            if dirac_monitoring_is_active:
                logger.warning('DIRAC monitoring inactive (no valid proxy '
                               'found).')
            dirac_monitoring_is_active = False
            return
        else:
            dirac_monitoring_is_active = True
        cmd = 'result = DiracCommands.status(%s)' % str(dirac_job_ids)
        result = dirac_monitoring_server.execute(cmd)
        # A non-list result signals a monitoring failure (error message/None).
        if not isinstance(result, list):
            logger.warning('DIRAC monitoring failed: %s' % str(result))
            return

        # Each entry of result lines up with the corresponding job in `jobs`:
        # (statusInfo, backend status, actualCE, ganga status)
        for j, state in zip(jobs, result):
            if monitoring_component and monitoring_component.should_stop():
                break
            j.backend.statusInfo = state[0]
            j.backend.status = state[1]
            j.backend.actualCE = state[2]
            cmd = 'result = DiracCommands.normCPUTime(%d)' % j.backend.id
            j.backend.normCPUTime = dirac_monitoring_server.execute(cmd)
            ganga_status = state[3]
            if ganga_status != 'completed' and ganga_status != j.status:
                j.updateStatus(ganga_status)
            if ganga_status == 'completed':
                # Fetch the output sandbox before declaring success.
                j.updateStatus('completing')
                ok = j.backend._getOutputSandbox(dirac_monitoring_server)
                if ok and j.outputdata:
                    j.backend._getOutputDataLFNs(dirac_monitoring_server, True)
                j.updateStatus('completed' if ok else 'failed')
            if ganga_status == 'failed':
                # Optionally grab the sandbox of failed jobs for debugging.
                if configBoss['failed_sandbox_download']:
                    j.backend._getOutputSandbox(dirac_monitoring_server)
Beispiel #4
0
 def updateMonitoringInformation(jobs):
     """
     Check the status of jobs against the DIRAC monitoring server and
     retrieve output sandboxes for completed jobs.
     Args:
         jobs (list): Ganga jobs whose backend status should be refreshed
     """
     from Ganga.Core import monitoring_component
     dirac_job_ids = [j.backend.id for j in jobs]
     global dirac_monitoring_server
     global dirac_monitoring_is_active
     # Without a valid proxy we cannot talk to DIRAC at all; warn once.
     if not dirac_monitoring_server.proxy.isValid():
         if dirac_monitoring_is_active:
             logger.warning('DIRAC monitoring inactive (no valid proxy '
                            'found).')
         dirac_monitoring_is_active = False
         return
     else:
         dirac_monitoring_is_active = True
     cmd = 'result = DiracCommands.status(%s)' % str(dirac_job_ids)
     result = dirac_monitoring_server.execute(cmd)
     # A non-list result signals a monitoring failure (error message/None).
     if not isinstance(result, list):
         logger.warning('DIRAC monitoring failed: %s' % str(result))
         return

     # Each entry of result lines up with the corresponding job in `jobs`:
     # (statusInfo, backend status, actualCE, ganga status)
     for j, state in zip(jobs, result):
         if monitoring_component and monitoring_component.should_stop():
             break
         j.backend.statusInfo = state[0]
         j.backend.status = state[1]
         j.backend.actualCE = state[2]
         cmd = 'result = DiracCommands.normCPUTime(%d)' % j.backend.id
         j.backend.normCPUTime = dirac_monitoring_server.execute(cmd)
         ganga_status = state[3]
         if ganga_status != 'completed' and ganga_status != j.status:
             j.updateStatus(ganga_status)
         if ganga_status == 'completed':
             # Fetch the output sandbox before declaring success.
             j.updateStatus('completing')
             ok = j.backend._getOutputSandbox(dirac_monitoring_server)
             if ok and j.outputdata:
                 j.backend._getOutputDataLFNs(dirac_monitoring_server, True)
             j.updateStatus('completed' if ok else 'failed')
         if ganga_status == 'failed':
             # Optionally grab the sandbox of failed jobs for debugging.
             if configBoss['failed_sandbox_download']:
                 j.backend._getOutputSandbox(dirac_monitoring_server)
Beispiel #5
0
    def updateMonitoringInformation(_jobs):
        """
        Check the status of jobs against DIRAC and either update their Ganga
        status directly or queue them for threaded finalisation.

        Only those jobs in 'submitted'/'running' are passed in here for checking.
        If however they have already completed in Dirac they may have been put on
        the queue for processing from last time. These should be put back on the
        queue without querying dirac again. Their signature is status = running
        and job.backend.status already set to Done or Failed etc.

        Args:
            _jobs (list): Proxied Ganga jobs to be monitored
        """

        jobs = [stripProxy(j) for j in _jobs]

        logger = getLogger()

        # make sure proxy is valid before talking to DIRAC at all
        if not _proxyValid():
            if DiracBase.dirac_monitoring_is_active:
                logger.warning('DIRAC monitoring inactive (no valid proxy found).')
                DiracBase.dirac_monitoring_is_active = False
            return
        else:
            DiracBase.dirac_monitoring_is_active = True

        # remove from consideration any jobs already in the queue. Checking this non persisted attribute
        # is better than querying the queue as cant tell if a job has just been taken off queue and is being processed
        # also by not being persistent, this attribute automatically allows queued jobs from last session to be considered
        # for requeing
        interesting_jobs = [j for j in jobs if not j.been_queued]
        # status that correspond to a ganga 'completed' or 'failed' (see DiracCommands.status(id))
        # if backend status is these then the job should be on the queue
        requeue_dirac_status = {'Completed': 'completed',
                                'Done': 'completed',
                                'Failed': 'failed',
                                'Deleted': 'failed',
                                'Unknown: No status for Job': 'failed'}

        monitor_jobs = [j for j in interesting_jobs if j.backend.status not in requeue_dirac_status]
        requeue_jobs = [j for j in interesting_jobs if j.backend.status in requeue_dirac_status]

        logger.debug('Interesting jobs: ' + repr([j.fqid for j in interesting_jobs]))
        logger.debug('Monitor jobs    : ' + repr([j.fqid for j in monitor_jobs]))
        logger.debug('Requeue jobs    : ' + repr([j.fqid for j in requeue_jobs]))

        from Ganga.GPI import queues

        # requeue any job already known to be in a finalised DIRAC state
        for j in requeue_jobs:
            queues._monitoring_threadpool.add_function(DiracBase.job_finalisation,
                                                       args=(j, requeue_dirac_status[j.backend.status]),
                                                       priority=5, name="Job %s Finalizing" % j.fqid)
            j.been_queued = True

        # A job with no backend id never made it to DIRAC (e.g. a crashed
        # session during submit, see #104454) - mark it failed.
        dead_jobs = (j for j in monitor_jobs if j.backend.id is None)
        for d in dead_jobs:
            d.updateStatus('failed')
            if d.master is not None:
                d.master.updateMasterJobStatus()

        ganga_job_status = [j.status for j in monitor_jobs if j.backend.id is not None]
        dirac_job_ids = [j.backend.id for j in monitor_jobs if j.backend.id is not None]

        result = execute('status(%s)' % str(dirac_job_ids))

        # A length mismatch means the bulk status query failed.
        if len(result) != len(ganga_job_status):
            logger.warning('Dirac monitoring failed for %s, result = %s' % (
                str(dirac_job_ids), str(result)))
            return

        from Ganga.Core import monitoring_component

        # 'completed'/'failed' jobs get finalised on the thread pool rather
        # than updated inline.
        thread_handled_states = ['completed', 'failed']
        for job, state, old_state in zip(monitor_jobs, result, ganga_job_status):
            if monitoring_component:
                if monitoring_component.should_stop():
                    break

            job.backend.statusInfo = state[0]
            job.backend.status = state[1]
            job.backend.actualCE = state[2]
            updated_dirac_status = state[3]
            try:
                job.backend.extraInfo = state[4]
            except Exception as err:
                logger.debug("gexception: %s" % str(err))
                pass
            logger.debug('Job status vector  : ' + job.fqid + ' : ' + repr(state))

            # If the user changed the job status under us, leave it alone.
            if job.status != old_state:
                logger.warning('User changed Ganga job status from %s -> %s' % (str(old_state), job.status))
                continue
            ####################

            if updated_dirac_status == job.status:
                continue

            if updated_dirac_status in thread_handled_states:
                if job.status != 'running':
                    DiracBase._getStateTime(job, 'running')
                    if job.status in ['removed', 'killed']:
                        continue
                    if (job.master and job.master.status in ['removed', 'killed']):
                        continue  # user changed it under us
                    job.updateStatus('running')
                    if job.master:
                        job.master.updateMasterJobStatus()

                queues._monitoring_threadpool.add_function(DiracBase.job_finalisation,
                                                           args=(job, updated_dirac_status),
                                                           priority=5, name="Job %s Finalizing" % job.fqid)
                job.been_queued = True

            else:
                DiracBase._getStateTime(job, updated_dirac_status)
                if job.status in ['removed', 'killed']:
                    continue
                if (job.master and job.master.status in ['removed', 'killed']):
                    continue  # user changed it under us
                job.updateStatus(updated_dirac_status)
                if job.master:
                    job.master.updateMasterJobStatus()
Beispiel #6
0
    def monitor_dirac_running_jobs(monitor_jobs, finalised_statuses):
        """
        Method to update the configuration of jobs which are in a submitted/running state in Ganga&Dirac
        Args:
            monitor_jobs (list): Jobs which are to be monitored for their status change
            finalised_statuses (dict): Dict of the Dirac statuses vs the Ganga statuses after running
        """

        # A job with no backend id never made it to DIRAC (e.g. a crashed
        # session during submit, see #104454) - mark it failed.
        dead_jobs = (j for j in monitor_jobs if j.backend.id is None)
        for d in dead_jobs:
            d.updateStatus('failed')
            if d.master is not None:
                d.master.updateMasterJobStatus()

        ganga_job_status = [
            j.status for j in monitor_jobs if j.backend.id is not None
        ]
        dirac_job_ids = [
            j.backend.id for j in monitor_jobs if j.backend.id is not None
        ]

        logger.debug("GangaStatus: %s" % str(ganga_job_status))
        logger.debug("diracJobIDs: %s" % str(dirac_job_ids))

        if not dirac_job_ids:
            ## Nothing to do here stop bugging DIRAC about it!
            ## Everything else beyond here in the function depends on some ids present here, no ids means we can stop.
            return

        statusmapping = configDirac['statusmapping']

        result, bulk_state_result = execute(
            'monitorJobs(%s, %s)' % (repr(dirac_job_ids), repr(statusmapping)),
            cred_req=monitor_jobs[0].backend.credential_requirements)

        # A length mismatch means the bulk monitoring query failed.
        if len(result) != len(ganga_job_status):
            logger.warning('Dirac monitoring failed for %s, result = %s' %
                           (str(dirac_job_ids), str(result)))
            logger.warning("Results: %s" % str(result))
            return

        # Jobs to be handed to requeue_dirac_finished_jobs at the end.
        requeue_job_list = []
        # Maps updated DIRAC status -> jobs in that status (for bulk timestamping).
        jobStateDict = {}

        # Maps new Ganga status -> jobs to move to it (applied after the loop).
        jobs_to_update = {}
        master_jobs_to_update = []

        for job, state, old_state in zip(monitor_jobs, result,
                                         ganga_job_status):
            # Stop early if the monitoring loop is shutting down.
            if monitoring_component:
                if monitoring_component.should_stop():
                    break

            if job.been_queued:
                continue

            job.backend.statusInfo = state[0]
            job.backend.status = state[1]
            job.backend.actualCE = state[2]
            updated_dirac_status = state[3]
            try:
                job.backend.extraInfo = state[4]
            except Exception as err:
                logger.debug("gexception: %s" % str(err))
                pass
            logger.debug('Job status vector  : ' + job.fqid + ' : ' +
                         repr(state))

            jobStateDict.setdefault(updated_dirac_status, []).append(job)

            if job.backend.status in finalised_statuses:
                # DIRAC says this job is done; push it through 'running'
                # (if needed) and queue it for finalisation.
                if job.status != 'running':
                    if job.status in ['removed', 'killed']:
                        requeue_job_list.append(job)
                    elif (job.master
                          and job.master.status in ['removed', 'killed']):
                        continue  # user changed it under us
                    else:
                        jobs_to_update.setdefault('running', []).append(job)
                        if job.master:
                            if job.master not in master_jobs_to_update:
                                master_jobs_to_update.append(job.master)
                        requeue_job_list.append(job)

            else:
                if job.status in ['removed', 'killed']:
                    continue
                if (job.master and job.master.status in ['removed', 'killed']):
                    continue  # user changed it under us
                if job.status != updated_dirac_status:
                    jobs_to_update.setdefault(updated_dirac_status, []).append(job)
                    if job.master:
                        if job.master not in master_jobs_to_update:
                            master_jobs_to_update.append(job.master)

        # Record state-transition timestamps in bulk.
        DiracBase._bulk_updateStateTime(jobStateDict, bulk_state_result)

        # Apply the collected status changes, then roll up to master jobs.
        for status in jobs_to_update:
            for job in jobs_to_update[status]:
                job.updateStatus(status, update_master=False)

        for j in master_jobs_to_update:
            j.updateMasterJobStatus()

        DiracBase.requeue_dirac_finished_jobs(requeue_job_list,
                                              finalised_statuses)
Beispiel #7
0
    def monitor_dirac_running_jobs(monitor_jobs, finalised_statuses):
        """
        Method to update the configuration of jobs which are in a submitted/running state in Ganga&Dirac
        Args:
            monitor_jobs (list): Jobs which are to be monitored for their status change
            finalised_statuses (dict): Dict of the Dirac statuses vs the Ganga statuses after running
        """

        # A job with no backend id never made it to DIRAC (e.g. a crashed
        # session during submit, see #104454) - mark it failed.
        dead_jobs = (j for j in monitor_jobs if j.backend.id is None)
        for d in dead_jobs:
            d.updateStatus('failed')
            if d.master is not None:
                d.master.updateMasterJobStatus()

        ganga_job_status = [j.status for j in monitor_jobs if j.backend.id is not None]
        dirac_job_ids = [j.backend.id for j in monitor_jobs if j.backend.id is not None]

        logger.debug("GangaStatus: %s" % str(ganga_job_status))
        logger.debug("diracJobIDs: %s" % str(dirac_job_ids))

        if not dirac_job_ids:
            ## Nothing to do here stop bugging DIRAC about it!
            ## Everything else beyond here in the function depends on some ids present here, no ids means we can stop.
            return

        statusmapping = configDirac['statusmapping']

        result, bulk_state_result = execute('monitorJobs(%s, %s)' %( repr(dirac_job_ids), repr(statusmapping)))

        # NOTE(review): the proxy check happens after the execute() call above;
        # looks intentional (bail out if the proxy expired mid-query) - confirm.
        if not DiracBase.checkDiracProxy():
            return

        # A length mismatch means the bulk monitoring query failed.
        if len(result) != len(ganga_job_status):
            # Fixed: this previously formatted the undefined name 'results',
            # raising NameError on the failure path instead of logging it.
            logger.warning('Dirac monitoring failed for %s, result = %s' % (str(dirac_job_ids), str(result)))
            logger.warning("Results: %s" % str(result))
            return

        from Ganga.Core import monitoring_component

        # Jobs to be handed to requeue_dirac_finished_jobs at the end.
        requeue_job_list = []
        # Maps updated DIRAC status -> jobs in that status (for bulk timestamping).
        jobStateDict = {}

        # Maps new Ganga status -> jobs to move to it (applied after the loop).
        jobs_to_update = {}
        master_jobs_to_update = []

        for job, state, old_state in zip(monitor_jobs, result, ganga_job_status):
            # Stop early if the monitoring loop is shutting down.
            if monitoring_component:
                if monitoring_component.should_stop():
                    break

            if job.been_queued:
                continue

            job.backend.statusInfo = state[0]
            job.backend.status = state[1]
            job.backend.actualCE = state[2]
            updated_dirac_status = state[3]
            try:
                job.backend.extraInfo = state[4]
            except Exception as err:
                logger.debug("gexception: %s" % str(err))
                pass
            logger.debug('Job status vector  : ' + job.fqid + ' : ' + repr(state))

            jobStateDict.setdefault(updated_dirac_status, []).append(job)

            if job.backend.status in finalised_statuses:
                # DIRAC says this job is done; push it through 'running'
                # (if needed) and queue it for finalisation.
                if job.status != 'running':
                    if job.status in ['removed', 'killed']:
                        requeue_job_list.append(job)
                    elif (job.master and job.master.status in ['removed', 'killed']):
                        continue  # user changed it under us
                    else:
                        jobs_to_update.setdefault('running', []).append(job)
                        if job.master:
                            if job.master not in master_jobs_to_update:
                                master_jobs_to_update.append(job.master)
                        requeue_job_list.append(job)

            else:
                if job.status in ['removed', 'killed']:
                    continue
                if (job.master and job.master.status in ['removed', 'killed']):
                    continue  # user changed it under us
                if job.status != updated_dirac_status:
                    jobs_to_update.setdefault(updated_dirac_status, []).append(job)
                    if job.master:
                        if job.master not in master_jobs_to_update:
                            master_jobs_to_update.append(job.master)

        # Record state-transition timestamps in bulk.
        DiracBase._bulk_updateStateTime(jobStateDict, bulk_state_result)

        # Apply the collected status changes, then roll up to master jobs.
        for status in jobs_to_update:
            for job in jobs_to_update[status]:
                job.updateStatus(status, update_master=False)

        for j in master_jobs_to_update:
            j.updateMasterJobStatus()

        DiracBase.requeue_dirac_finished_jobs(requeue_job_list, finalised_statuses)