def _setup_job_logs_retrieval(self, itask, event):
    """Schedule retrieval of job logs from a remote job host.

    Applies to job completion events only, i.e. succeeded, failed and
    (execution) retry. Nothing is scheduled when the job host is
    "localhost" (with no owner, or owned by the current user), when
    "retrieve job logs" is not set for the host, or when a timer already
    exists for this event and job.
    """
    timer_key = (
        (self.HANDLER_JOB_LOGS_RETRIEVE, event),
        str(itask.point),
        itask.tdef.name,
        itask.submit_num,
    )
    job_host = itask.task_host
    if itask.task_owner:
        job_host = itask.task_owner + "@" + itask.task_host
    completion_events = (
        self.EVENT_FAILED, self.EVENT_RETRY, self.EVENT_SUCCEEDED)

    # Guard clauses, evaluated in the same order as a short-circuit "or".
    if event not in completion_events:
        return
    if job_host in (get_user() + '@localhost', 'localhost'):
        return
    if not self.get_host_conf(itask, "retrieve job logs"):
        return
    if timer_key in self.event_timers:
        return

    # Fall back to a single immediate attempt if no delays are configured.
    delays = self.get_host_conf(
        itask, "retrieve job logs retry delays") or [0]
    context = TaskJobLogsRetrieveContext(
        self.HANDLER_JOB_LOGS_RETRIEVE,  # key
        self.HANDLER_JOB_LOGS_RETRIEVE,  # ctx_type
        job_host,
        self.get_host_conf(itask, "retrieve job logs max size"),
    )
    self.event_timers[timer_key] = TaskActionTimer(context, delays)
def _setup_event_mail(self, itask, event):
    """Schedule an email notification for a task event.

    Does nothing if a timer already exists for this event and job, or if
    the event is not listed in the task's "mail events" setting.
    """
    if event in self.NON_UNIQUE_EVENTS:
        # Non-unique events are numbered so that each gets its own timer.
        label = '%s-%d' % (event, itask.non_unique_events.get(event, 1))
    else:
        label = event
    timer_key = (
        (self.HANDLER_MAIL, label),
        str(itask.point),
        itask.tdef.name,
        itask.submit_num,
    )
    if timer_key in self.event_timers:
        return
    if event not in self._get_events_conf(itask, "mail events", []):
        return

    # Fall back to a single immediate attempt if no delays are configured.
    delays = self._get_events_conf(itask, "mail retry delays") or [0]
    context = TaskEventMailContext(
        self.HANDLER_MAIL,  # key
        self.HANDLER_MAIL,  # ctx_type
        self._get_events_conf(  # mail_from
            itask, "mail from", "notifications@" + get_host()),
        self._get_events_conf(itask, "mail to", get_user()),  # mail_to
        self._get_events_conf(itask, "mail smtp"),  # mail_smtp
    )
    self.event_timers[timer_key] = TaskActionTimer(context, delays)
def _set_retry_timers(itask, rtconfig=None):
    """Configure the submission and execution retry timers of a task.

    Uses the task definition's runtime configuration unless "rtconfig" is
    given. Does nothing if "disable retries" is set for the task's run
    mode. Existing timers get their delays updated; missing timers are
    created.
    """
    if rtconfig is None:
        rtconfig = itask.tdef.rtconfig
    try:
        retries_disabled = (
            rtconfig[itask.tdef.run_mode + ' mode']['disable retries'])
    except KeyError:
        retries_disabled = False
    if retries_disabled:
        return

    retry_settings = (
        (TASK_STATUS_SUBMIT_RETRYING, 'submission retry delays'),
        (TASK_STATUS_RETRYING, 'execution retry delays'),
    )
    for status, conf_name in retry_settings:
        delays = rtconfig['job'][conf_name]
        try:
            itask.try_timers[status].set_delays(delays)
        except KeyError:
            # No timer yet for this status.
            itask.try_timers[status] = TaskActionTimer(delays=delays)
def _reset_job_timers(self, itask):
    """Set up poll timer and timeout for task.

    If the task is not in an active status, its timeout and poll timer are
    cleared. If the poll timer already matches the current
    (submit number, status) context, nothing is changed. Otherwise the
    submission or execution timeout and polling intervals are worked out,
    a new poll timer is installed, the schedule is logged and the next
    poll time is set via self.check_poll_time.

    For a running job with an execution time limit, normal execution
    polling intervals that would overshoot the time limit are dropped, the
    last remaining interval is repeated to fill the gap, and any remainder
    is added to the first of the "execution time limit polling intervals"
    (default 60, 120 and 420 seconds), which are then appended.
    """
    if itask.state.status not in TASK_STATUSES_ACTIVE:
        # Reset, task not active
        itask.timeout = None
        itask.poll_timer = None
        return
    ctx = (itask.submit_num, itask.state.status)
    if itask.poll_timer and itask.poll_timer.ctx == ctx:
        # Timer already set up for this submit number and status
        return
    timeref = None  # reference time, submitted or started time
    timeout = None  # timeout in setting
    if itask.state.status == TASK_STATUS_RUNNING:
        timeref = itask.summary['started_time']
        timeout_key = 'execution timeout'
        timeout = self._get_events_conf(itask, timeout_key)
        delays = self.get_host_conf(
            itask, 'execution polling intervals', skey='job',
            default=[900])  # Default 15 minute intervals
        if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
            time_limit = itask.summary[self.KEY_EXECUTE_TIME_LIMIT]
            try:
                host_conf = self.get_host_conf(itask, 'batch systems')
                batch_sys_conf = host_conf[itask.summary['batch_sys_name']]
            except (TypeError, KeyError):
                batch_sys_conf = {}
            # Copy before modifying: the configured list must not be
            # changed in place, or the adjustments below would accumulate
            # in the configuration on every call.
            time_limit_delays = list(batch_sys_conf.get(
                'execution time limit polling intervals', [60, 120, 420]))
            timeout = time_limit + sum(time_limit_delays)
            # Same reason as above: work on a copy of the configured list.
            delays = list(delays)
            # Remove excessive polling before time limit
            while sum(delays) > time_limit:
                del delays[-1]
            # But fill up the gap before time limit
            # (a zero interval cannot fill a gap, and would divide by zero)
            if delays and delays[-1] > 0:
                size = int((time_limit - sum(delays)) / delays[-1])
                delays.extend([delays[-1]] * size)
            # First poll after the time limit absorbs any remaining gap
            time_limit_delays[0] += time_limit - sum(delays)
            delays += time_limit_delays
    else:  # if itask.state.status == TASK_STATUS_SUBMITTED:
        timeref = itask.summary['submitted_time']
        timeout_key = 'submission timeout'
        timeout = self._get_events_conf(itask, timeout_key)
        delays = self.get_host_conf(
            itask, 'submission polling intervals', skey='job',
            default=[900])  # Default 15 minute intervals
    try:
        itask.timeout = timeref + float(timeout)
        timeout_str = intvl_as_str(timeout)
    except (TypeError, ValueError):
        # No (usable) timeout or reference time
        itask.timeout = None
        timeout_str = None
    itask.poll_timer = TaskActionTimer(ctx=ctx, delays=delays)
    # Log timeout and polling schedule
    message = 'health check settings: %s=%s' % (timeout_key, timeout_str)
    # Attempt to group identical consecutive delays as N*DELAY,...
    if itask.poll_timer.delays:
        items = []  # [[number of item - 1, item], ...]
        for delay in itask.poll_timer.delays:
            if items and items[-1][1] == delay:
                items[-1][0] += 1
            else:
                items.append([0, delay])
        message += ', polling intervals='
        for num, item in items:
            if num:
                message += '%d*' % (num + 1)
            message += '%s,' % intvl_as_str(item)
        message += '...'
    LOG.info(message, itask=itask)
    # Set next poll time
    self.check_poll_time(itask)
def _setup_custom_event_handlers(self, itask, event, message):
    """Set up custom task event handlers.

    Handlers come from the "<event> handler" setting or, if that is not
    set and the event is listed in "handler events", from the general
    "handlers" setting. One timer is queued per handler, unless a timer
    for that handler, event and job already exists.

    Each handler is either a command template containing %(name)s
    substitutions, or a plain command that is called with 4 arguments:
    event, suite, task identity and message (classic interface). A
    template that references an unknown name is logged as an error and
    skipped; the other handlers are still queued.
    """
    handlers = self._get_events_conf(itask, event + ' handler')
    if (handlers is None and
            event in self._get_events_conf(itask, 'handler events', [])):
        handlers = self._get_events_conf(itask, 'handlers')
    if handlers is None:
        return
    retry_delays = self._get_events_conf(
        itask,
        'handler retry delays',
        self.get_host_conf(itask, "task event handler retry delays"))
    if not retry_delays:
        retry_delays = [0]
    # There can be multiple custom event handlers
    for i, handler in enumerate(handlers):
        key1 = ("%s-%02d" % (self.HANDLER_CUSTOM, i), event)
        id_key = (
            key1, str(itask.point), itask.tdef.name, itask.submit_num)
        if id_key in self.event_timers:
            continue
        # Note: user@host may not always be set for a submit number, e.g.
        # on late event or if host select command fails. Use null string to
        # prevent issues in this case.
        user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
        if user_at_host and '@' not in user_at_host:
            # (only has 'user@' on the front if user is not suite owner).
            user_at_host = '%s@%s' % (get_user(), user_at_host)
        # Custom event handler can be a command template string
        # or a command that takes 4 arguments (classic interface)
        # Note quote() fails on None, need str(None).
        try:
            handler_data = {
                "event": quote(event),
                "suite": quote(self.suite),
                "point": quote(str(itask.point)),
                "name": quote(itask.tdef.name),
                "submit_num": itask.submit_num,
                "id": quote(itask.identity),
                "message": quote(message),
                "batch_sys_name": quote(
                    str(itask.summary['batch_sys_name'])),
                "batch_sys_job_id": quote(
                    str(itask.summary['submit_method_id'])),
                "submit_time": quote(
                    str(itask.summary['submitted_time_string'])),
                "start_time": quote(
                    str(itask.summary['started_time_string'])),
                "finish_time": quote(
                    str(itask.summary['finished_time_string'])),
                "user@host": quote(user_at_host)
            }

            if self.suite_cfg:
                for key, value in self.suite_cfg.items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    else:
                        handler_data["suite_" + key] = quote(value)

            if itask.tdef.rtconfig['meta']:
                for key, value in itask.tdef.rtconfig['meta'].items():
                    if key == "URL":
                        handler_data["task_url"] = quote(value)
                    handler_data[key] = quote(value)

            cmd = handler % (handler_data)
        except KeyError as exc:
            # Keep the error text in its own variable: rebinding "message"
            # here would pass the error text, instead of the event message,
            # to every handler that follows in this loop.
            err_message = "%s/%s/%02d %s bad template: %s" % (
                itask.point, itask.tdef.name, itask.submit_num, key1, exc)
            LOG.error(err_message)
            continue

        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s' '%s'" % (
                handler, event, self.suite, itask.identity, message)
        LOG.debug("Queueing %s handler: %s" % (event, cmd), itask=itask)
        self.event_timers[id_key] = (
            TaskActionTimer(
                CustomTaskEventHandlerContext(
                    key1,
                    self.HANDLER_CUSTOM,
                    cmd,
                ),
                retry_delays))
def _prep_submit_task_job_impl(self, suite, itask, rtconfig):
    """Prepare a task job for submission (helper for _prep_submit_task_job).

    Increments the submit number, records the job owner/host, batch system
    and extra log files in the task summary, sets up the poll timers and
    the retry timers, then calls self._create_job_log_path and returns the
    job configuration as a dict.
    """
    time_limit_key = self.KEY_EXECUTE_TIME_LIMIT

    # Submit number
    itask.submit_num += 1
    itask.summary['submit_num'] = itask.submit_num

    # Job owner and host
    itask.task_owner = rtconfig['remote']['owner']
    job_host = (
        itask.task_owner + "@" + itask.task_host
        if itask.task_owner else itask.task_host)
    itask.summary['host'] = job_host
    itask.summary['job_hosts'][itask.submit_num] = job_host

    itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
    for log_name in rtconfig['extra log files']:
        itask.summary['logfiles'].append(expandvars(log_name))

    # Host settings for the job's batch system, if there are any
    try:
        all_batch_sys_conf = self.task_events_mgr.get_host_conf(
            itask, 'batch systems')
        batch_sys_conf = all_batch_sys_conf[rtconfig['job']['batch system']]
    except (TypeError, KeyError):
        batch_sys_conf = {}

    # Execution time limit: the summary value is left as it was if the
    # configured value cannot be passed to float() (TypeError).
    try:
        itask.summary[time_limit_key] = float(
            rtconfig['job']['execution time limit'])
    except TypeError:
        pass
    if itask.summary[time_limit_key]:
        # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
        # minutes after time limit exceeded
        itask.poll_timers[time_limit_key] = TaskActionTimer(
            delays=batch_sys_conf.get(
                'execution time limit polling intervals', [60, 120, 420]))

    # Submission and execution poll timers: reset existing ones, create
    # missing ones if polling intervals are configured.
    poll_settings = (
        ('submission polling intervals', TASK_STATUS_SUBMITTED),
        ('execution polling intervals', TASK_STATUS_RUNNING),
    )
    for conf_name, status in poll_settings:
        if status in itask.poll_timers:
            itask.poll_timers[status].reset()
            continue
        intervals = self.task_events_mgr.get_host_conf(
            itask, conf_name, skey='job')
        if intervals:
            itask.poll_timers[status] = TaskActionTimer(delays=intervals)

    scripts = self._get_job_scripts(itask, rtconfig)

    # Retry delays, needed for the try_num
    self._set_retry_timers(itask, rtconfig)

    # Location of job file, etc
    self._create_job_log_path(suite, itask)
    job_d = self.task_events_mgr.get_task_job_id(
        itask.point, itask.tdef.name, itask.submit_num)
    job_log_root = GLOBAL_CFG.get_derived_host_item(
        suite, "suite job log directory",
        itask.task_host, itask.task_owner)
    job_file_path = os.path.join(job_log_root, job_d, self.JOB_FILE_BASE)

    job_conf = {
        'batch_system_name': rtconfig['job']['batch system'],
        'batch_submit_command_template': (
            rtconfig['job']['batch submit command template']),
        'batch_system_conf': batch_sys_conf,
        'directives': rtconfig['directives'],
        'environment': rtconfig['environment'],
        'execution_time_limit': itask.summary[time_limit_key],
        'env-script': rtconfig['env-script'],
        'err-script': rtconfig['err-script'],
        'host': itask.task_host,
        'init-script': rtconfig['init-script'],
        'job_file_path': job_file_path,
        'job_d': job_d,
        'namespace_hierarchy': itask.tdef.namespace_hierarchy,
        'owner': itask.task_owner,
        'param_env_tmpl': rtconfig['parameter environment templates'],
        'param_var': itask.tdef.param_var,
        'post-script': scripts[2],
        'pre-script': scripts[0],
        'remote_suite_d': rtconfig['remote']['suite definition directory'],
        'script': scripts[1],
        'shell': rtconfig['job']['shell'],
        'submit_num': itask.submit_num,
        'suite_name': suite,
        'task_id': itask.identity,
        'try_num': itask.get_try_num(),
        'work_d': rtconfig['work sub-directory'],
    }
    return job_conf
def _prep_submit_task_job_impl(self, suite, itask):
    """Prepare a task job for submission (helper for _prep_submit_task_job).

    Applies any broadcast overrides to a copy of the task's runtime
    configuration, sets the retry timers, increments the submit number and
    records the new job in the suite database manager. It then selects the
    job host and owner, sets up the poll timers, calls self.init_host and
    self._create_job_log_path, and returns the job configuration as a dict.
    """
    time_limit_key = self.KEY_EXECUTE_TIME_LIMIT

    # Runtime configuration, with broadcast settings applied to a copy
    overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
        itask.identity)
    if not overrides:
        rtconfig = itask.tdef.rtconfig
    else:
        rtconfig = pdeepcopy(itask.tdef.rtconfig)
        poverride(rtconfig, overrides)

    # Retry delays, needed for the try_num
    self._set_retry_timers(itask, rtconfig)

    # Submit number and try number
    LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
    itask.submit_num += 1
    itask.summary['submit_num'] = itask.submit_num
    itask.local_job_file_path = None
    new_job_record = {
        "is_manual_submit": itask.is_manual_submit,
        "try_num": itask.get_try_num(),
        "time_submit": get_current_time_string(),
    }
    self.suite_db_mgr.put_insert_task_jobs(itask, new_job_record)

    itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
    for log_name in rtconfig['extra log files']:
        itask.summary['logfiles'].append(expandvars(log_name))

    # Determine task host settings now, just before job submission,
    # because dynamic host selection may be used.
    # host may be None (= run task on suite host)
    itask.task_host = get_task_host(rtconfig['remote']['host'])
    if not itask.task_host:
        itask.task_host = 'localhost'
    elif itask.task_host != "localhost":
        LOG.info("[%s] -Task host: %s" % (
            itask.identity, itask.task_host))

    itask.task_owner = rtconfig['remote']['owner']
    job_host = (
        itask.task_owner + "@" + itask.task_host
        if itask.task_owner else itask.task_host)
    itask.summary['host'] = job_host
    itask.summary['job_hosts'][itask.submit_num] = job_host

    # Host settings for the job's batch system, if there are any
    try:
        all_batch_sys_conf = self.task_events_mgr.get_host_conf(
            itask, 'batch systems')
        batch_sys_conf = all_batch_sys_conf[rtconfig['job']['batch system']]
    except (TypeError, KeyError):
        batch_sys_conf = {}

    # Execution time limit: the summary value is left as it was if the
    # configured value cannot be passed to float() (TypeError).
    try:
        itask.summary[time_limit_key] = float(
            rtconfig['job']['execution time limit'])
    except TypeError:
        pass
    if itask.summary[time_limit_key]:
        # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
        # minutes after time limit exceeded
        itask.poll_timers[time_limit_key] = TaskActionTimer(
            delays=batch_sys_conf.get(
                'execution time limit polling intervals', [60, 120, 420]))

    # Submission and execution poll timers: reset existing ones, create
    # missing ones if polling intervals are configured.
    poll_settings = (
        ('submission polling intervals', TASK_STATUS_SUBMITTED),
        ('execution polling intervals', TASK_STATUS_RUNNING),
    )
    for conf_name, status in poll_settings:
        if status in itask.poll_timers:
            itask.poll_timers[status].reset()
            continue
        intervals = self.task_events_mgr.get_host_conf(
            itask, conf_name, skey='job')
        if intervals:
            itask.poll_timers[status] = TaskActionTimer(delays=intervals)

    self.init_host(suite, itask.task_host, itask.task_owner)
    if itask.state.outputs.has_custom_triggers():
        self.suite_db_mgr.put_update_task_outputs(itask)
    job_record_update = {
        "user_at_host": job_host,
        "batch_sys_name": itask.summary['batch_sys_name'],
    }
    self.suite_db_mgr.put_update_task_jobs(itask, job_record_update)
    itask.is_manual_submit = False

    scripts = self._get_job_scripts(itask, rtconfig)

    # Location of job file, etc
    self._create_job_log_path(suite, itask)
    job_d = self.task_events_mgr.get_task_job_id(
        itask.point, itask.tdef.name, itask.submit_num)
    job_log_root = GLOBAL_CFG.get_derived_host_item(
        suite, "suite job log directory",
        itask.task_host, itask.task_owner)
    job_file_path = os.path.join(job_log_root, job_d, self.JOB_FILE_BASE)

    job_conf = {
        'batch_system_name': rtconfig['job']['batch system'],
        'batch_submit_command_template': (
            rtconfig['job']['batch submit command template']),
        'batch_system_conf': batch_sys_conf,
        'directives': rtconfig['directives'],
        'environment': rtconfig['environment'],
        'execution_time_limit': itask.summary[time_limit_key],
        'env-script': rtconfig['env-script'],
        'err-script': rtconfig['err-script'],
        'host': itask.task_host,
        'init-script': rtconfig['init-script'],
        'job_file_path': job_file_path,
        'job_d': job_d,
        'namespace_hierarchy': itask.tdef.namespace_hierarchy,
        'owner': itask.task_owner,
        'param_var': itask.tdef.param_var,
        'post-script': scripts[2],
        'pre-script': scripts[0],
        'remote_suite_d': rtconfig['remote']['suite definition directory'],
        'script': scripts[1],
        'shell': rtconfig['job']['shell'],
        'submit_num': itask.submit_num,
        'suite_name': suite,
        'task_id': itask.identity,
        'try_num': itask.get_try_num(),
        'work_d': rtconfig['work sub-directory'],
    }
    return job_conf