Esempio n. 1
0
 def _getVerificationSubmitThrottle(self, submitCount):
     """Cap the number of jobs to submit according to the verification chunks.

     The chunk is selected by bisecting ``self._verifyChunks`` with the sum of
     the PROCESSED and PROCESSING job counts.

     Returns 0 if the success goal of the selected chunk can not be met
     anymore, otherwise ``submitCount`` limited to the room left in the
     selected chunk (success ratio below the chunk threshold) or in the
     following chunk. If a chunk / threshold lookup raises IndexError,
     ``self._verify`` is set to False and ``submitCount`` is returned as is.
     """
     nActive = self.jobDB.getJobsN(ClassSelector(JobClass.PROCESSING))
     nSuccess = self.jobDB.getJobsN(ClassSelector(JobClass.SUCCESS))
     nDone = self.jobDB.getJobsN(ClassSelector(JobClass.PROCESSED))
     nTotal = nDone + nActive
     chunkIdx = bisect.bisect_left(self._verifyChunks, nTotal)
     try:
         chunkSize = self._verifyChunks[chunkIdx]
         ratio = nSuccess * 1.0 / chunkSize
         threshold = self._verifyThresh[chunkIdx]
         goal = chunkSize * threshold
         # Best case: all jobs of the chunk that are not done yet succeed
         if chunkSize - nDone + nSuccess < goal:
             if not self._unreachableGoal:  # report the veto only once
                 self._log_user_time.warning(
                     'All remaining jobs are vetoed by an unachieveable verification goal!'
                 )
                 self._log_user_time.info(
                     'Current goal: %d successful jobs out of %d', goal,
                     chunkSize)
                 self._unreachableGoal = True
             return 0
         if ratio < threshold:
             upperLimit = chunkSize
         else:
             # may raise IndexError for the last chunk => throttle gets disabled
             upperLimit = self._verifyChunks[chunkIdx + 1]
         return min(submitCount, upperLimit - nTotal)
     except IndexError:
         self._log_user_time.debug('All verification chunks passed')
         self._log_user_time.debug(
             'Verification submission throttle disabled')
         self._verify = False
         return submitCount
Esempio n. 2
0
    def check(self, wms):
        """Check a sample of the processing jobs and react to the result.

        Returns False if the job check signalled an abort (change is None),
        otherwise the change flag (forced to True if jobs timed out).
        """
        activeJobs = self.jobDB.getJobs(ClassSelector(JobClass.PROCESSING))
        sampleSize = utils.QM(self._chunks_enabled, self._chunks_check, -1)
        jobList = self._sample(activeJobs, sampleSize)

        # Result of the check: change flag, timed out jobs and jobs which reported a status
        checkResult = self._checkJobList(wms, jobList)
        (change, timeoutList, reported) = checkResult
        nSilent = len(jobList) - len(reported)
        if nSilent > 0:
            self._log_user_time.critical(
                '%d job(s) did not report their status!', nSilent)
        if change is None:  # neither True or False => abort
            return False

        # Jobs exceeding their time limit are cancelled without asking the user
        if len(timeoutList) != 0:
            change = True
            self._log_user.warning('Timeout for the following jobs:')
            self.cancel(wms, timeoutList, interactive=False, showJobs=True)

        # Apply changes requested by the task
        self._processIntervention(wms, self._task.getIntervention())

        # All jobs in an end state => finish the task
        nFinished = self.jobDB.getJobsN(ClassSelector(JobClass.ENDSTATE))
        if nFinished == len(self.jobDB):
            self._logDisabledJobs()
            self._eventhandler.onTaskFinish(len(self.jobDB))
            if self._task.canFinish():
                self._log_user_time.info(
                    'Task successfully completed. Quitting grid-control!')
                utils.abort(True)

        return change
Esempio n. 3
0
 def display(self):
     """Feed the bar with the sizes of four job classes and print it to stdout."""
     nSuccess = len(self._jobDB.getJobs(ClassSelector(JobClass.SUCCESS)))
     nAtWMS = len(self._jobDB.getJobs(ClassSelector(JobClass.ATWMS)))
     nRunningDone = len(
         self._jobDB.getJobs(ClassSelector(JobClass.RUNNING_DONE)))
     nFailing = len(self._jobDB.getJobs(ClassSelector(JobClass.FAILING)))
     self._bar.update(nSuccess, nAtWMS, nRunningDone, nFailing)
     output = str(self._bar) + '\n'
     sys.stdout.write(output)
Esempio n. 4
0
    def _process_intervention(self, task, wms):
        """Apply job database changes requested by the task module.

        ``task.get_intervention()`` yields ``(redo, disable, size_change)``.
        Nothing happens if all three are falsy. Otherwise the job limit is
        adapted if ``self._get_max_jobs(task)`` differs from the current
        database size, processing jobs listed in ``redo`` / ``disable`` are
        cancelled and the listed jobs are reset to Job.INIT / Job.DISABLED.

        Raises JobError if a listed job is not in a resetable state.
        """
        resetable_state_list = [
            Job.INIT, Job.DISABLED, Job.ABORTED, Job.CANCELLED, Job.DONE,
            Job.FAILED, Job.SUCCESS
        ]

        def _reset_state(jobnum_list, state_new):
            # Jobs that still have to be reset - whatever is left over at the end failed
            jobnum_set_pending = set(jobnum_list)
            for jobnum in jobnum_list:
                if jobnum not in jobnum_set_pending:
                    # Duplicate entry of a job that was already reset; without this
                    # guard the set.remove below would raise KeyError
                    continue
                job_obj = self.job_db.get_job_persistent(jobnum)
                if job_obj.state in resetable_state_list:
                    self._update(task, job_obj, jobnum, state_new)
                    jobnum_set_pending.remove(jobnum)
                    job_obj.attempt = 0

            if len(jobnum_set_pending) > 0:
                raise JobError(
                    'For the following jobs it was not possible to reset the state to %s:\n%s'
                    % (Job.enum2str(state_new),
                       str.join(', ', imap(str, jobnum_set_pending))))

        (redo, disable, size_change) = task.get_intervention()
        if (not redo) and (not disable) and (not size_change):
            return
        self._log.log_time(
            logging.INFO,
            'The task module has requested changes to the job database')
        max_job_len_new = self._get_max_jobs(task)
        applied_change = False
        if max_job_len_new != len(self.job_db):
            self._log.log_time(logging.INFO,
                               'Number of jobs changed from %d to %d',
                               len(self.job_db), max_job_len_new)
            self.job_db.set_job_limit(max_job_len_new)
            applied_change = True
        if redo:
            # Only jobs that are currently processing need to be cancelled first
            self._cancel(task,
                         wms,
                         self.job_db.get_job_list(
                             ClassSelector(JobClass.PROCESSING), redo),
                         interactive=False,
                         show_jobs=True)
            _reset_state(redo, Job.INIT)
            applied_change = True
        if disable:
            self._cancel(task,
                         wms,
                         self.job_db.get_job_list(
                             ClassSelector(JobClass.PROCESSING), disable),
                         interactive=False,
                         show_jobs=True)
            _reset_state(disable, Job.DISABLED)
            applied_change = True
        if applied_change:
            self._log.log_time(logging.INFO,
                               'All requested changes are applied')
Esempio n. 5
0
    def _processIntervention(self, wms, jobChanges):
        """Apply job database changes requested by the task module.

        ``jobChanges`` is either falsy (nothing to do) or a tuple
        ``(redo, disable, sizeChange)``. The job limit is adapted if
        ``self.getMaxJobs(self._task)`` differs from ``self.jobDB.jobLimit``,
        processing jobs listed in ``redo`` / ``disable`` are cancelled and the
        listed jobs are reset to Job.INIT / Job.DISABLED.

        Raises JobError if a listed job is missing or not in a resetable state.
        """
        def resetState(jobs, newState):
            # Jobs that still have to be reset - whatever is left over at the end failed
            jobSet = set(jobs)
            for jobNum in jobs:
                if jobNum not in jobSet:
                    # Duplicate entry of a job that was already reset; without this
                    # guard the set.remove below would raise KeyError
                    continue
                jobObj = self.jobDB.get(jobNum)
                if jobObj and jobObj.state in [
                        Job.INIT, Job.DISABLED, Job.ABORTED, Job.CANCELLED,
                        Job.DONE, Job.FAILED, Job.SUCCESS
                ]:
                    self._update(jobObj, jobNum, newState)
                    jobSet.remove(jobNum)
                    jobObj.attempt = 0
            if len(jobSet) > 0:
                output = (Job.enum2str(newState),
                          str.join(', ', imap(str, jobSet)))
                raise JobError(
                    'For the following jobs it was not possible to reset the state to %s:\n%s'
                    % output)

        if jobChanges:
            (redo, disable, sizeChange) = jobChanges
            # The early exit only triggers for the literal "no change" triple
            if (redo == []) and (disable == []) and (sizeChange is False):
                return
            self._log_user_time.info(
                'The task module has requested changes to the job database')
            newMaxJobs = self.getMaxJobs(self._task)
            applied_change = False
            if newMaxJobs != self.jobDB.jobLimit:
                self._log_user_time.info(
                    'Number of jobs changed from %d to %d', len(self.jobDB),
                    newMaxJobs)
                self.jobDB.jobLimit = newMaxJobs
                applied_change = True
            if redo:
                # Only jobs that are currently processing need to be cancelled first
                self.cancel(wms,
                            self.jobDB.getJobs(
                                ClassSelector(JobClass.PROCESSING), redo),
                            interactive=False,
                            showJobs=True)
                resetState(redo, Job.INIT)
                applied_change = True
            if disable:
                self.cancel(wms,
                            self.jobDB.getJobs(
                                ClassSelector(JobClass.PROCESSING), disable),
                            interactive=False,
                            showJobs=True)
                resetState(disable, Job.DISABLED)
                applied_change = True
            if applied_change:
                self._log_user_time.info('All requested changes are applied')
Esempio n. 6
0
 def delete(self, wms, select):
     """Interactively cancel the processing jobs matched by ``select``."""
     processingSelector = ClassSelector(JobClass.PROCESSING)
     userSelector = JobSelector.create(select, task=self._task)
     matchedJobs = self.jobDB.getJobs(
         AndJobSelector(processingSelector, userSelector))
     if not matchedJobs:
         return  # nothing selected => nothing to cancel
     self._log_user.warning('Cancelling the following jobs:')
     self.cancel(wms, matchedJobs, interactive=True, showJobs=True)
Esempio n. 7
0
    def retrieve(self, wms):
        """Retrieve the output of a sample of DONE jobs and update their state.

        Returns True if at least one job changed its state, False if nothing
        changed or if an abort was detected after handling a job.
        """
        doneJobs = self.jobDB.getJobs(ClassSelector(JobClass.DONE))
        sampleSize = utils.QM(self._chunks_enabled, self._chunks_retrieve, -1)
        jobList = self._sample(doneJobs, sampleSize)

        changed = False
        retrievedIter = wms.retrieveJobs(self._wmsArgs(jobList))
        for (jobNum, retCode, data, outputdir) in retrievedIter:
            jobObj = self.jobDB.get(jobNum)
            if jobObj is None:  # skip output of jobs missing in the job database
                continue

            if retCode == 0:
                # A job only counts as successful if its output passes the output processor
                if self._outputProcessor.process(outputdir):
                    state = Job.SUCCESS
                else:
                    retCode = 108
                    state = Job.FAILED
            elif retCode == 107:  # set ABORTED instead of FAILED for errorcode 107
                state = Job.ABORTED
            else:
                state = Job.FAILED

            if state != jobObj.state:
                changed = True
                jobObj.set('retcode', retCode)
                jobObj.set('runtime', data.get('TIME', -1))
                self._update(jobObj, jobNum, state)
                self._eventhandler.onJobOutput(wms, jobObj, jobNum, retCode)

            if utils.abort():
                return False

        return changed
Esempio n. 8
0
 def delete(self, task, wms, select):
     """Cancel the processing jobs matched by ``select``.

     Whether the cancellation is interactive is taken from
     ``self._interactive_delete``.
     """
     processing_selector = ClassSelector(JobClass.PROCESSING)
     user_selector = JobSelector.create(select, task=task)
     jobnum_list = self.job_db.get_job_list(
         AndJobSelector(processing_selector, user_selector))
     if not jobnum_list:
         return  # nothing selected => nothing to cancel
     self._log.warning('Cancelling the following jobs:')
     self.cancel(wms, jobnum_list,
                 interactive=self._interactive_delete, show_jobs=True)
Esempio n. 9
0
 def _submit_get_jobs_throttled(self, job_len_submit):
     """Cap the number of jobs to submit according to the verification chunks.

     Verification heuristic: more jobs are only released once enough jobs of
     the current chunk have succeeded. The chunk is selected by bisecting
     ``self._verify_chunk_list`` with the PROCESSED + PROCESSING job count.

     Returns 0 if the success goal of the selected chunk can not be met
     anymore, otherwise ``job_len_submit`` limited to the room left in the
     selected chunk (success ratio below threshold) or in the following one.
     If a chunk / threshold lookup raises IndexError, ``self._verify`` is set
     to False and ``job_len_submit`` is returned as is.
     """
     n_active = self.job_db.get_job_len(ClassSelector(JobClass.PROCESSING))
     n_success = self.job_db.get_job_len(ClassSelector(JobClass.SUCCESS))
     n_done = self.job_db.get_job_len(ClassSelector(JobClass.PROCESSED))
     n_total = n_done + n_active
     chunk_idx = bisect.bisect_left(self._verify_chunk_list, n_total)
     try:
         chunk_size = self._verify_chunk_list[chunk_idx]
         ratio = n_success * 1.0 / chunk_size
         threshold = self._verify_threshold_list[chunk_idx]
         goal = chunk_size * threshold
         # Best case: all jobs of the chunk that are not done yet succeed
         if chunk_size - n_done + n_success < goal:
             if not self._unreachable_goal_flag:  # report the veto only once
                 self._log.log_time(
                     logging.WARNING,
                     'All remaining jobs are vetoed by an unachieveable verification goal!'
                 )
                 self._log.log_time(
                     logging.INFO,
                     'Current goal: %d successful jobs out of %d', goal,
                     chunk_size)
                 self._unreachable_goal_flag = True
             return 0
         if ratio < threshold:
             upper_limit = chunk_size
         else:
             # may raise IndexError for the last chunk => throttle gets disabled
             upper_limit = self._verify_chunk_list[chunk_idx + 1]
         return min(job_len_submit, upper_limit - n_total)
     except IndexError:
         clear_current_exception()
         self._log.log_time(logging.DEBUG, 'All verification chunks passed')
         self._log.log_time(logging.DEBUG,
                            'Verification submission throttle disabled')
         self._verify = False
         return job_len_submit
Esempio n. 10
0
    def _getSubmissionJobs(self, maxsample):
        """Select the jobs that should be submitted next.

        Candidates are the READY jobs that are below the retry limit (if
        ``self._job_retries`` >= 0) and accepted by ``self._task.canSubmit``.
        The amount is capped by the in-queue / in-flight limits and - with
        chunks enabled - by ``maxsample``.

        Returns a sample of the candidates if shuffling is enabled, otherwise
        the first entries of the sorted candidates.
        """
        readyList = self.jobDB.getJobs(ClassSelector(JobClass.READY))
        retryOK = readyList
        fallbackJob = Job()

        def belowRetryLimit(jobNum):
            jobObj = self.jobDB.get(jobNum, fallbackJob)
            return jobObj.attempt - 1 < self._job_retries

        if self._job_retries >= 0:
            retryOK = lfilter(belowRetryLimit, readyList)
        modOK = lfilter(self._task.canSubmit, readyList)
        jobList = set(retryOK) & set(modOK)

        # Ready jobs exist, but none of them can be submitted => tell the user why
        if self._showBlocker and readyList and not jobList:
            reasons = []
            reasons.extend(
                utils.QM((len(retryOK) > 0) and (len(modOK) == 0), [],
                         ['have hit their maximum number of retries']))
            reasons.extend(
                utils.QM((len(retryOK) == 0) and (len(modOK) > 0), [],
                         ['are vetoed by the task module']))
            delimiter = utils.QM(retryOK or modOK, ' or ', ' and ')
            self._log_user_time.warning(
                'All remaining jobs %s!', str.join(delimiter, reasons))
        # Do not show the blocker message again as long as the situation persists
        self._showBlocker = (len(readyList) <= 0) or (len(jobList) != 0)

        # Collect all upper limits for the number of jobs to submit
        limits = [len(jobList)]
        if self._njobs_inqueue > 0:
            nQueued = self.jobDB.getJobsN(ClassSelector(JobClass.ATWMS))
            limits.append(self._njobs_inqueue - nQueued)
        if self._njobs_inflight > 0:
            nInflight = self.jobDB.getJobsN(
                ClassSelector(JobClass.PROCESSING))
            limits.append(self._njobs_inflight - nInflight)
        if self._chunks_enabled and (maxsample > 0):
            limits.append(maxsample)
        submit = max(min(limits), 0)

        if not self._do_shuffle:
            return sorted(jobList)[:submit]
        return self._sample(jobList, submit)
Esempio n. 11
0
    def check(self, task, wms):
        """Check a sample of the processing jobs and react to the result.

        Returns False if the job check signalled an abort (change is None),
        otherwise the change flag (forced to True if jobs timed out).
        """
        jobnum_list_active = self.job_db.get_job_list(
            ClassSelector(JobClass.PROCESSING))
        chunk_size = self._get_chunk_size(self._chunks_check)
        jobnum_list = self._sample(jobnum_list_active, chunk_size)

        # Result of the check: change flag, timed out jobs and jobs which reported a status
        check_result = self._check_get_jobnum_list(task, wms, jobnum_list)
        (change, jobnum_list_timeout, reported) = check_result
        n_silent = len(jobnum_list) - len(reported)
        if n_silent > 0:
            self._log.log_time(logging.CRITICAL,
                               '%d job(s) did not report their status!',
                               n_silent)
        if change is None:  # neither True or False => abort
            return False

        # Jobs exceeding their time limit are cancelled without asking the user
        if len(jobnum_list_timeout) != 0:
            change = True
            self._log.warning('Timeout for the following jobs:')
            self._cancel(task, wms, jobnum_list_timeout,
                         interactive=False, show_jobs=True)

        # Apply changes requested by the task
        self._process_intervention(task, wms)

        # All jobs in an end state => finish the task if it allows that
        n_finished = self.job_db.get_job_len(ClassSelector(JobClass.ENDSTATE))
        if n_finished == len(self.job_db):
            self._log_disabled_jobs()
            if task.can_finish():
                self._local_event_handler.on_task_finish(
                    task, len(self.job_db))
                abort(True)

        return change
Esempio n. 12
0
    def _submit_get_jobs(self, task):
        """Select the jobs that should be submitted next.

        The submit candidates are filtered by ``self._get_enabled_jobs`` and
        the amount is capped by the in-queue, in-flight and chunk limits.

        Returns a sample of the enabled jobs if shuffling is enabled,
        otherwise the first entries of the sorted enabled jobs.
        """
        jobnum_list_ready = self.job_db.get_job_list(
            ClassSelector(JobClass.SUBMIT_CANDIDATES))
        enabled_info = self._get_enabled_jobs(task, jobnum_list_ready)
        (n_mod_ok, n_retry_ok, jobnum_list) = enabled_info

        # Candidates exist, but none of them can be submitted => tell the user why
        if self._show_blocker and jobnum_list_ready and not jobnum_list:
            reasons = []
            if (n_retry_ok <= 0) or (n_mod_ok != 0):
                reasons.append('have hit their maximum number of retries')
            if (n_retry_ok != 0) and (n_mod_ok <= 0):
                reasons.append('are vetoed by the task module')
            if n_retry_ok or n_mod_ok:
                delimiter = ' or '
            else:
                delimiter = ' and '
            self._log.log_time(logging.WARNING, 'All remaining jobs %s!',
                               str.join(delimiter, reasons))
        # Do not show the blocker message again as long as the situation persists
        self._show_blocker = ((len(jobnum_list_ready) <= 0)
                              or (len(jobnum_list) != 0))

        # Collect all upper limits for the number of jobs to submit
        limits = [len(jobnum_list)]
        if self._njobs_inqueue > 0:
            n_queued = self.job_db.get_job_len(ClassSelector(JobClass.ATWMS))
            limits.append(self._njobs_inqueue - n_queued)
        if self._njobs_inflight > 0:
            n_inflight = self.job_db.get_job_len(
                ClassSelector(JobClass.PROCESSING))
            limits.append(self._njobs_inflight - n_inflight)
        if self._chunks_enabled and (self._chunks_submit > 0):
            limits.append(self._chunks_submit)
        submit = max(min(limits), 0)

        if not self._do_shuffle:
            return sorted(jobnum_list)[:submit]
        return self._sample(jobnum_list, submit)
Esempio n. 13
0
 def reset(self, wms, select):
     """Reset the jobs matched by ``select`` after the user confirmed it.

     Matched jobs that are still processing are cancelled, then every matched
     job is replaced by a fresh ``Job()`` entry in the job database.
     """
     selected = self.jobDB.getJobs(
         JobSelector.create(select, task=self._task))
     if not selected:
         return
     self._log_user.warning('Resetting the following jobs:')
     report = self._reportClass(self.jobDB, self._task, selected)
     report.display()
     confirmed = utils.getUserBool(
         'Are you sure you want to reset the state of these jobs?', False)
     if not confirmed:
         return
     stillProcessing = self.jobDB.getJobs(
         ClassSelector(JobClass.PROCESSING), selected)
     self.cancel(wms, stillProcessing, False, False)
     for jobNum in selected:
         self.jobDB.commit(jobNum, Job())
Esempio n. 14
0
 def _logDisabledJobs(self):
     """Write the list of disabled jobs to the disabled jobs logfile.

     The job numbers are written one per line to
     ``self._disabled_jobs_logfile``. If there are disabled jobs, the user is
     informed about their number and the location of the logfile.

     Raises JobError if the logfile could not be written.
     """
     disabled = self.jobDB.getJobs(ClassSelector(JobClass.DISABLED))
     try:
         fp = SafeFile(self._disabled_jobs_logfile, 'w')
         try:
             fp.write(str.join('\n', imap(str, disabled)))
         finally:
             # Release the file handle even if writing fails
             fp.close()
     except Exception:
         raise JobError('Could not write disabled jobs to file %s!' %
                        self._disabled_jobs_logfile)
     if disabled:
         self._log_user_time.warning(
             'There are %d disabled jobs in this task!', len(disabled))
         self._log_user_time.debug(
             'Please refer to %s for a complete list of disabled jobs.',
             self._disabled_jobs_logfile)
Esempio n. 15
0
 def _log_disabled_jobs(self):
     """Write the list of disabled jobs to the disabled jobs logfile.

     The job numbers are written one per line to
     ``self._disabled_jobs_logfile``. If there are disabled jobs, their
     number and the location of the logfile are logged.

     Raises JobError if the logfile could not be written.
     """
     jobnum_list_disabled = self.job_db.get_job_list(
         ClassSelector(JobClass.DISABLED))

     def _write_jobnum_list(fp):
         return fp.write(str.join('\n', imap(str, jobnum_list_disabled)))

     try:
         with_file(SafeFile(self._disabled_jobs_logfile, 'w'),
                   _write_jobnum_list)
     except Exception:
         raise JobError('Could not write disabled jobs to file %s!' %
                        self._disabled_jobs_logfile)
     if not jobnum_list_disabled:
         return
     self._log.log_time(logging.WARNING,
                        'There are %d disabled jobs in this task!',
                        len(jobnum_list_disabled))
     self._log.log_time(
         logging.DEBUG,
         'Please refer to %s for a complete list of disabled jobs.',
         self._disabled_jobs_logfile)
Esempio n. 16
0
def get_script_object(config_file, job_selector_str, only_success=False, require_task=False):
	"""Return an object bundling the configs, the task and the job database.

	The returned object has the attributes ``config`` (created with
	``load_only_old_config=True``), ``new_config``, ``task`` and ``job_db``.
	With ``only_success`` set, the job selector is combined with a selector
	for the SUCCESS job class.
	"""
	class ScriptObject(object):
		def __init__(self, config, new_config, task, job_db):
			self.config = config
			self.new_config = new_config
			self.task = task
			self.job_db = job_db

	old_config = gc_create_config(config_file=config_file, load_only_old_config=True)
	selector_and_task = _get_job_selector_and_task(old_config, job_selector_str, require_task)
	(task, job_selector) = selector_and_task
	if only_success:
		success_selector = ClassSelector(JobClass.SUCCESS)
		job_selector = JobSelector.create_instance('AndJobSelector',
			success_selector, job_selector)
	new_config = gc_create_config(config_file=config_file)
	jobs_config = new_config.change_view(set_sections=['jobs'])
	job_db_kwargs = {'job_selector': job_selector}
	job_db = jobs_config.get_plugin('job database', 'TextFileJobDB', cls='JobDB',
		pkwargs=job_db_kwargs, on_change=None)
	return ScriptObject(old_config, new_config, task, job_db)
Esempio n. 17
0
 def reset(self, task, wms, select):
     """Reset the jobs matched by ``select``.

     The user prompt is skipped if ``self._interactive_reset`` is truthy;
     otherwise the reset only happens if the prompt is answered positively.
     Matched jobs that are still processing are cancelled, then every matched
     job is replaced by a fresh ``Job()`` entry in the job database.
     """
     selected = self.job_db.get_job_list(
         JobSelector.create(select, task=task))
     if not selected:
         return
     self._log.warning('Resetting the following jobs:')
     self._abort_report.show_report(self.job_db, selected)
     ask_user_msg = 'Are you sure you want to reset the state of these jobs?'
     confirmed = self._interactive_reset
     if not confirmed:
         confirmed = self._uii.prompt_bool(ask_user_msg, False)
     if not confirmed:
         return
     still_processing = self.job_db.get_job_list(
         ClassSelector(JobClass.PROCESSING), selected)
     self.cancel(wms, still_processing, interactive=False, show_jobs=False)
     for jobnum in selected:
         self.job_db.commit(jobnum, Job())
Esempio n. 18
0
 def __init__(self, config, datasource_name):
     """Set up the scanner from the workflow of an external config file.

     The file given by the 'source config' option is loaded with
     ``load_only_old_config=True`` and its workflow is created with the
     'NullWMS' backend. The sorted job numbers that are in the SUCCESS class
     and match the 'source job selector' option are stored in
     ``self._selected``.
     """
     InfoScanner.__init__(self, config, datasource_name)
     ext_config_fn = config.get_fn('source config')
     ext_config_raw = create_config(ext_config_fn,
                                    load_only_old_config=True)
     ext_config = ext_config_raw.change_view(set_sections=['global'])
     self._ext_work_dn = ext_config.get_work_path()
     # Silence the root logger while the external workflow is created
     logging.getLogger().disabled = True
     try:
         ext_workflow = ext_config.get_plugin('workflow',
                                              'Workflow:global',
                                              cls='Workflow',
                                              pkwargs={'backend': 'NullWMS'})
     finally:
         # Re-enable logging even if the plugin creation fails
         logging.getLogger().disabled = False
     self._ext_task = ext_workflow.task
     job_selector = JobSelector.create(config.get('source job selector',
                                                  ''),
                                       task=self._ext_task)
     self._selected = sorted(
         ext_workflow.job_manager.job_db.get_job_list(
             AndJobSelector(ClassSelector(JobClass.SUCCESS), job_selector)))
Esempio n. 19
0
    def retrieve(self, task, wms):
        """Retrieve the output of a sample of DONE jobs and update their state.

        Returns True if at least one job changed its state, False if nothing
        changed or if an abort was detected after handling a job.
        """
        jobnum_list_done = self.job_db.get_job_list(
            ClassSelector(JobClass.DONE))
        chunk_size = self._get_chunk_size(self._chunks_retrieve)
        jobnum_list = self._sample(jobnum_list_done, chunk_size)

        changed = False
        for job_output in wms.retrieve_jobs(self._get_wms_args(jobnum_list)):
            (jobnum, exit_code, data, outputdir) = job_output
            job_obj = self.job_db.get_job(jobnum)
            if job_obj is None:  # skip output of jobs missing in the job database
                continue

            if exit_code == 0:
                # A job only counts as successful if its output passes the output processor
                if self._output_processor.process(outputdir, task):
                    state = Job.SUCCESS
                else:
                    exit_code = 108
                    state = Job.FAILED
            elif exit_code == 107:  # set ABORTED instead of FAILED for errorcode 107
                state = Job.ABORTED
            else:
                state = Job.FAILED

            if state != job_obj.state:
                changed = True
                job_obj.set('retcode', exit_code)
                job_obj.set('runtime', data.get('TIME', -1))
                self._update(task, job_obj, jobnum, state)
                self._local_event_handler.on_job_output(
                    task, wms, job_obj, jobnum, exit_code)

            if abort():
                return False

        return changed
Esempio n. 20
0
 def display(self):
     """Feed the bar with the number of SUCCESS jobs and print it to stdout."""
     successJobs = self._jobDB.getJobs(ClassSelector(JobClass.SUCCESS))
     self._bar.update(len(successJobs))
     output = str(self._bar) + '\n'
     sys.stdout.write(output)