def _process_message_failed(self, itask, event_time, message):
    """Handle a "failed" task message (helper for process_message).

    Records the job exit in the runtime DB, then either lines up a retry
    (if a retry timer remains) or marks the task as definitively failed,
    firing the matching event handlers in each case.
    """
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    # NOTE: .next() advances the retry timer as a side effect.
    try_timers = itask.try_timers
    has_retry = (
        TASK_STATUS_RETRYING in try_timers
        and try_timers[TASK_STATUS_RETRYING].next() is not None)
    if has_retry:
        # A retry is lined up: move to the retrying state.
        retry_note = "retrying in %s" % (
            try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
        summary_msg = "failed, %s" % retry_note
        LOG.info(
            "job(%02d) %s" % (itask.submit_num, summary_msg), itask=itask)
        itask.summary['latest_message'] = summary_msg
        if itask.state.reset_state(TASK_STATUS_RETRYING):
            self.setup_event_handlers(
                itask, "retry", "%s, %s" % (self.JOB_FAILED, retry_note))
    else:
        # No retry lined up: definitive failure.
        self.pflag = True
        if itask.state.reset_state(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, "failed", message)
        LOG.critical(
            "job(%02d) %s" % (itask.submit_num, "failed"), itask=itask)
    self._reset_job_timers(itask)
def _process_message_succeeded(self, itask, event_time):
    """Handle a "succeeded" task message (helper for process_message).

    Records the successful job exit in the runtime DB, updates the
    elapsed-time statistics, reports any custom outputs left incomplete,
    and moves the task to the succeeded state.
    """
    self.pflag = True
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 0,
        "time_run_exit": event_time,
    })
    # Update mean elapsed time only on task succeeded.
    if itask.summary['started_time'] is not None:
        itask.tdef.elapsed_times.append(
            itask.summary['finished_time'] - itask.summary['started_time'])
    if not itask.state.outputs.all_completed():
        # Outputs that contradict success are not worth reporting here.
        incomplete = [
            output
            for output in itask.state.outputs.get_not_completed()
            if output not in (
                TASK_OUTPUT_EXPIRED,
                TASK_OUTPUT_SUBMIT_FAILED,
                TASK_OUTPUT_FAILED)]
        if incomplete:
            LOG.info(
                "Succeeded with outputs not completed: %s" % "".join(
                    "\n " + output for output in incomplete),
                itask=itask)
    if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
        self.setup_event_handlers(itask, "succeeded", "job succeeded")
    self._reset_job_timers(itask)
def report(self, request, server_obj):
    """Log client requests with identifying information.

    In debug mode log all requests including task messages. Otherwise log
    all user commands, and just the first info request from each client.
    """
    # Server methods may be called internally as well as by clients.
    if threading.current_thread().__class__.__name__ == '_MainThread':
        return
    auth_user, prog_name, user, host, uuid, priv_level = get_client_info()
    server_name = server_obj.__class__.__name__
    # Command-type servers are always logged ...
    always_logged = server_name in [
        "SuiteCommandServer", "ExtTriggerServer", "BroadcastServer"]
    # ... info-type servers only on a client's first contact.
    first_contact = (
        server_name not in ["SuiteIdServer", "TaskMessageServer"]
        and uuid not in self.clients)
    if cylc.flags.debug or always_logged or first_contact:
        LOG.debug(
            self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
                user, host, prog_name, priv_level, uuid))
        LOG.info(
            self.__class__.LOG_COMMAND_TMPL % (
                request, user, host, prog_name, uuid))
    if server_name == "SuiteIdServer":
        self._num_id_requests += 1
        self.report_id_requests()
    self.clients[uuid] = datetime.datetime.utcnow()
    self._housekeep()
def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
    """Poll jobs of specified tasks.

    Any job that is or was submitted or running can be polled, except for
    retrying tasks - which would poll (correctly) as failed. And don't
    poll succeeded tasks by default.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as help/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    pollable = {
        TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED}
    if poll_succ:
        # Succeeded tasks are pollable only on request.
        pollable.add(TASK_STATUS_SUCCEEDED)
    poll_me = []
    for itask in itasks:
        if itask.state.status in pollable:
            poll_me.append(itask)
        else:
            LOG.debug("skipping %s: not pollable, "
                      "or skipping 'succeeded' tasks" % itask.identity)
    if poll_me:
        if msg is not None:
            LOG.info(msg)
        self._run_job_cmd(
            self.JOBS_POLL, suite, poll_me, self._poll_task_jobs_callback)
def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message.

    Records the failed submission in the runtime DB, then either lines up
    a submission retry or marks the task submit-failed for good.

    Args:
        itask: the task proxy the message belongs to.
        event_time: ISO timestamp of the event, or None to use "now".
    """
    LOG.error(self.EVENT_SUBMIT_FAILED, itask=itask)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(itask, {
        # Fix: record the established event_time; previously a second
        # "now" timestamp was taken here, which could disagree with the
        # event_time computed above.
        "time_submit_exit": event_time,
        "submit_status": 1,
    })
    itask.summary['submit_method_id'] = None
    # NOTE: .next() advances the retry timer as a side effect.
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        self.pflag = True
        # See github #476.
        if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                'job %s' % self.EVENT_SUBMIT_FAILED)
    else:
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
        itask.summary['latest_message'] = msg
        if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_RETRY,
                "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
    self._reset_job_timers(itask)
def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
    """Poll jobs of specified tasks.

    Any job that is or was submitted or running can be polled, except for
    retrying tasks - which would poll (correctly) as failed. And don't
    poll succeeded tasks by default.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as help/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    base_pollable = (
        TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED)
    to_poll_tasks = []
    for itask in itasks:
        status = itask.state.status
        # Succeeded tasks are pollable only when poll_succ is set.
        if status in base_pollable or (
                poll_succ and status == TASK_STATUS_SUCCEEDED):
            to_poll_tasks.append(itask)
        else:
            LOG.debug("skipping %s: not pollable, "
                      "or skipping 'succeeded' tasks" % itask.identity)
    if not to_poll_tasks:
        return
    if msg is not None:
        LOG.info(msg)
    self._run_job_cmd(
        self.JOBS_POLL, suite, to_poll_tasks,
        self._poll_task_jobs_callback)
def _process_message_failed(self, itask, event_time, message):
    """Handle a "failed" message for a task (process_message helper).

    Persists the job exit to the runtime DB, then retries the task or
    definitively fails it depending on whether a retry timer remains.
    """
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    # NOTE: .next() advances the retry timer as a side effect.
    no_retry = (
        TASK_STATUS_RETRYING not in itask.try_timers
        or itask.try_timers[TASK_STATUS_RETRYING].next() is None)
    if no_retry:
        # No retry lined up: definitive failure.
        self.pflag = True
        if itask.state.reset_state(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, "failed", message)
        LOG.critical(
            "job(%02d) %s" % (itask.submit_num, "failed"), itask=itask)
    else:
        # There is a retry lined up.
        retry_in = itask.try_timers[
            TASK_STATUS_RETRYING].delay_timeout_as_str()
        delay_msg = "retrying in %s" % retry_in
        latest = "failed, %s" % delay_msg
        LOG.info("job(%02d) %s" % (itask.submit_num, latest), itask=itask)
        itask.summary['latest_message'] = latest
        if itask.state.reset_state(TASK_STATUS_RETRYING):
            self.setup_event_handlers(
                itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
    self._reset_job_timers(itask)
def _process_message_succeeded(self, itask, event_time):
    """Handle a "succeeded" message for a task (process_message helper).

    Persists the successful exit, folds the run time into the task's
    elapsed-time statistics, notes incomplete custom outputs, and resets
    the task to the succeeded state.
    """
    self.pflag = True
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 0,
        "time_run_exit": event_time,
    })
    # Update mean elapsed time only on task succeeded.
    if itask.summary['started_time'] is not None:
        itask.tdef.elapsed_times.append(
            itask.summary['finished_time'] -
            itask.summary['started_time'])
    if not itask.state.outputs.all_completed():
        msg = ""
        for output in itask.state.outputs.get_not_completed():
            # Outputs that contradict success are not reported here.
            if output in (TASK_OUTPUT_EXPIRED, TASK_OUTPUT_SUBMIT_FAILED,
                          TASK_OUTPUT_FAILED):
                continue
            msg += "\n " + output
        if msg:
            LOG.info(
                "Succeeded with outputs not completed: %s" % msg,
                itask=itask)
    if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
        self.setup_event_handlers(itask, "succeeded", "job succeeded")
    self._reset_job_timers(itask)
def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message.

    Records the failed submission in the runtime DB, then either lines up
    a submission retry or marks the task submit-failed for good.

    Args:
        itask: the task proxy the message belongs to.
        event_time: ISO timestamp of the event, or None to use "now".
    """
    LOG.error(self.EVENT_SUBMIT_FAILED, itask=itask)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(
        itask, {
            # Fix: record the established event_time; previously a second
            # "now" timestamp was taken here, which could disagree with
            # the event_time computed above.
            "time_submit_exit": event_time,
            "submit_status": 1,
        })
    itask.summary['submit_method_id'] = None
    # NOTE: .next() advances the retry timer as a side effect.
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        self.pflag = True
        # See github #476.
        if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                'job %s' % self.EVENT_SUBMIT_FAILED)
    else:
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
        itask.summary['latest_message'] = msg
        if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_RETRY,
                "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
    self._reset_job_timers(itask)
def load_db_broadcast_states(self, row_idx, row):
    """Load broadcast variables from runtime DB broadcast states row."""
    if row_idx == 0:
        LOG.info("LOADING broadcast states")
    point, namespace, key, value = row
    # Split a "[section]...key" string into its section path + bare key.
    section_path = []
    bare_key = key
    if "]" in bare_key:
        section_path = self.REC_SECTION.findall(bare_key)
        bare_key = bare_key.rsplit(r"]", 1)[-1]
    with self.lock:
        # Walk (creating as needed) down to the innermost section dict.
        target = self.broadcasts.setdefault(point, {}).setdefault(
            namespace, {})
        for section in section_path:
            target = target.setdefault(section, {})
        target[bare_key] = value
    LOG.info(
        CHANGE_FMT.strip() % {
            "change": CHANGE_PREFIX_SET,
            "point": point,
            "namespace": namespace,
            "key": key,
            "value": value})
def _check_access_priv_and_report(
        self, required_privilege_level, log_info=True):
    """Check access privilege and log requests with identifying info.

    In debug mode log all requests including task messages. Otherwise log
    all user commands, and just the first info command from each client.

    Return:
        dict: containing the client session
    """
    self._check_access_priv(required_privilege_level)
    command = inspect.currentframe().f_back.f_code.co_name
    auth_user, prog_name, user, host, uuid = _get_client_info()
    priv_level = self._get_priv_level(auth_user)
    LOG.debug(self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
        user, host, prog_name, priv_level, uuid))
    # Parentheses spell out the original and/or precedence:
    # log everything in debug mode, else only a client's first command
    # (and only when the caller asked for info logging).
    if cylc.flags.debug or (uuid not in self.clients and log_info):
        LOG.info(self.__class__.LOG_COMMAND_TMPL % (
            command, user, host, prog_name, uuid))
    session = self.clients.setdefault(uuid, {})
    session['time'] = time()
    self._housekeep()
    return session
def clear_broadcast(self, point_strings=None, namespaces=None, cancel_settings=None): """Clear broadcasts globally, or for listed namespaces and/or points. Return a tuple (modified_settings, bad_options), where: * modified_settings is similar to the return value of the "put" method, but for removed broadcasts. * bad_options is a dict in the form: {"point_strings": ["20020202", ..."], ...} The dict is only populated if there are options not associated with previous broadcasts. The keys can be: * point_strings: a list of bad point strings. * namespaces: a list of bad namespaces. * cancel: a list of tuples. Each tuple contains the keys of a bad setting. """ # If cancel_settings defined, only clear specific broadcasts cancel_keys_list = self._settings_to_keys_list(cancel_settings) # Clear broadcasts modified_settings = [] with self.lock: for point_string, point_string_settings in self.broadcasts.items(): if point_strings and point_string not in point_strings: continue for namespace, namespace_settings in ( point_string_settings.items()): if namespaces and namespace not in namespaces: continue stuff_stack = [([], namespace_settings)] while stuff_stack: keys, stuff = stuff_stack.pop() for key, value in stuff.items(): if isinstance(value, dict): stuff_stack.append((keys + [key], value)) elif (not cancel_keys_list or keys + [key] in cancel_keys_list): stuff[key] = None setting = {key: value} for rkey in reversed(keys): setting = {rkey: setting} modified_settings.append( (point_string, namespace, setting)) # Prune any empty branches bad_options = self._get_bad_options(self._prune(), point_strings, namespaces, cancel_keys_list) # Log the broadcast self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True) LOG.info(get_broadcast_change_report(modified_settings, is_cancel=True)) if bad_options: LOG.error(get_broadcast_bad_options_report(bad_options)) return (modified_settings, bad_options)
def _run_event_handlers_callback(self, proc_ctx, abort_on_error=False):
    """Callback on completion of a suite event handler."""
    if not proc_ctx.ret_code:
        # Handler exited cleanly.
        LOG.info(str(proc_ctx))
        return
    msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
    LOG.error(str(proc_ctx))
    ERR.error(msg)
    if abort_on_error:
        raise SuiteEventError(msg)
def _run_event_handlers_callback(proc_ctx, abort_on_error=False):
    """Callback on completion of a suite event handler."""
    if proc_ctx.ret_code:
        failure = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
        LOG.error(str(proc_ctx))
        LOG.error(failure)
        if abort_on_error:
            raise SuiteEventError(failure)
        return
    # Handler exited cleanly.
    LOG.info(str(proc_ctx))
def satisfy_xclock(self, itask):
    """Attempt to satisfy itask's clock trigger, if it has one."""
    label, sig, ctx, satisfied = self._get_xclock(itask)
    if satisfied:
        # Nothing to do: already satisfied.
        return
    if not wall_clock(*ctx.func_args, **ctx.func_kwargs):
        # Trigger time not reached yet.
        return
    itask.state.xclock = (label, True)
    self.sat_xclock.append(sig)
    LOG.info('clock xtrigger satisfied: %s = %s' % (label, str(ctx)))
def signout(self, server_obj):
    """Force forget this client (for use by GUI etc.)."""
    caller = server_obj.getLocalStorage().caller
    LOG.info(
        self.__class__.LOG_SIGNOUT_TMPL % (
            caller.user, caller.host, caller.prog_name, caller.uuid))
    # Discard quietly if already forgotten.
    self.clients.pop(caller.uuid, None)
    self._housekeep()
def signout(self, server_obj):
    """Force forget this client (for use by GUI etc.).

    Args:
        server_obj: the server object handling the current request; its
            local storage carries the calling client's identity.
    """
    caller = server_obj.getLocalStorage().caller
    LOG.info(
        self.__class__.LOG_SIGNOUT_TMPL % (
            caller.user, caller.host, caller.prog_name, caller.uuid))
    try:
        del self.clients[caller.uuid]
    except KeyError:
        # Already forgotten.  (Was a bare "except:", which would also
        # have swallowed KeyboardInterrupt/SystemExit.)
        pass
    self._housekeep()
def put_broadcast(self, point_strings=None, namespaces=None, settings=None):
    """Add new broadcast settings (server side interface).

    Return a tuple (modified_settings, bad_options) where:
      modified_settings is list of modified settings in the form:
        [("20200202", "foo", {"command scripting": "true"}, ...]
      bad_options is as described in the docstring for self.clear().
    """
    modified_settings = []
    bad_point_strings = []
    bad_namespaces = []
    with self.lock:
        for setting in settings:
            for point_string in point_strings:
                # Standardise the point and check its validity.
                bad_point = False
                try:
                    point_string = standardise_point_string(point_string)
                except PointParsingError:
                    # "*" is the match-all wildcard, not a bad point.
                    if point_string != '*':
                        bad_point_strings.append(point_string)
                    bad_point = True
                if not bad_point and point_string not in self.broadcasts:
                    self.broadcasts[point_string] = {}
                for namespace in namespaces:
                    if namespace not in self.linearized_ancestors:
                        bad_namespaces.append(namespace)
                    elif not bad_point:
                        if namespace not in self.broadcasts[point_string]:
                            self.broadcasts[point_string][namespace] = {}
                        # Merge the new setting into the stored tree.
                        self._addict(
                            self.broadcasts[point_string][namespace],
                            setting)
                        modified_settings.append(
                            (point_string, namespace, setting))
    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings)
    LOG.info(get_broadcast_change_report(modified_settings))
    bad_options = {}
    if bad_point_strings:
        bad_options["point_strings"] = bad_point_strings
    if bad_namespaces:
        bad_options["namespaces"] = bad_namespaces
    return modified_settings, bad_options
def check_task_jobs(self, suite, task_pool):
    """Check submission and execution timeout and polling timers.

    Poll tasks that have timed out and/or have reached next polling time.
    """
    now = time()
    poll_tasks = set()
    for itask in task_pool.get_tasks():
        if not self.task_events_mgr.check_job_time(itask, now):
            continue
        poll_tasks.add(itask)
        if itask.poll_timer.delay is not None:
            LOG.info(
                'poll now, (next in %s)' % (
                    itask.poll_timer.delay_timeout_as_str()),
                itask=itask)
    if poll_tasks:
        self.poll_task_jobs(suite, poll_tasks)
def check_task_jobs(self, suite, task_pool):
    """Check submission and execution timeout and polling timers.

    Poll tasks that have timed out and/or have reached next polling time.
    """
    now = time()
    due_tasks = set()
    for itask in task_pool.get_tasks():
        if self.task_events_mgr.check_job_time(itask, now):
            due_tasks.add(itask)
            if itask.poll_timer.delay is not None:
                LOG.info('poll now, (next in %s)' % (
                    itask.poll_timer.delay_timeout_as_str()), itask=itask)
    if due_tasks:
        self.poll_task_jobs(suite, due_tasks)
def report_id_requests(self):
    """Report the frequency of identification (scan) requests."""
    current_time = time.time()
    interval = current_time - self._id_start_time
    if interval <= self.CLIENT_ID_REPORT_SECONDS:
        # Too soon since the last report window started.
        return
    report = self.__class__.LOG_IDENTIFY_TMPL % (
        self._num_id_requests, interval)
    rate = float(self._num_id_requests) / interval
    if rate > self.CLIENT_ID_MIN_REPORT_RATE:
        LOG.warning(report)
    elif cylc.flags.debug:
        LOG.info(report)
    # Start a new report window.
    self._id_start_time = current_time
    self._num_id_requests = 0
def _process_message_submitted(self, itask, event_time):
    """Helper for process_message, handle a submit-succeeded message."""
    try:
        LOG.info(
            ('job[%(submit_num)02d] submitted to'
             ' %(host)s:%(batch_sys_name)s[%(submit_method_id)s]') %
            itask.summary,
            itask=itask)
    except KeyError:
        # Summary is missing one of the template keys; skip the log line.
        pass
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": event_time,
        "submit_status": 0,
        "batch_sys_job_id": itask.summary.get('submit_method_id')})

    if itask.tdef.run_mode == 'simulation':
        # Simulate job execution at this point.
        itask.set_summary_time('submitted', event_time)
        itask.set_summary_time('started', event_time)
        itask.state.reset_state(TASK_STATUS_RUNNING)
        itask.state.outputs.set_completion(TASK_OUTPUT_STARTED, True)
        return

    itask.set_summary_time('submitted', event_time)
    # Unset started and finished times in case of resubmission.
    itask.set_summary_time('started')
    itask.set_summary_time('finished')
    itask.summary['latest_message'] = TASK_OUTPUT_SUBMITTED
    self.setup_event_handlers(
        itask, TASK_OUTPUT_SUBMITTED, 'job submitted')

    self.pflag = True
    if itask.state.status == TASK_STATUS_READY:
        # The job started message can (rarely) come in before the submit
        # command returns - in which case do not go back to 'submitted'.
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
        try:
            # Arm the submission timeout from the configured interval.
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = (
                itask.summary['submitted_time'] +
                float(self._get_events_conf(itask, 'submission timeout')))
        except (TypeError, ValueError):
            # No (or unparseable) timeout configured.
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = None
        self.set_poll_time(itask)
def _process_message_submitted(self, itask, event_time):
    """Helper for process_message, handle a submit-succeeded message."""
    try:
        LOG.info(('job[%(submit_num)02d] submitted to'
                  ' %(host)s:%(batch_sys_name)s[%(submit_method_id)s]') %
                 itask.summary,
                 itask=itask)
    except KeyError:
        # Summary is missing one of the template keys; skip the log line.
        pass
    self.suite_db_mgr.put_update_task_jobs(
        itask, {
            "time_submit_exit": event_time,
            "submit_status": 0,
            "batch_sys_job_id": itask.summary.get('submit_method_id')
        })

    if itask.tdef.run_mode == 'simulation':
        # Simulate job execution at this point.
        itask.set_event_time('started', event_time)
        itask.state.reset_state(TASK_STATUS_RUNNING)
        itask.state.outputs.set_completion(TASK_OUTPUT_STARTED, True)
        return

    itask.set_event_time('submitted', event_time)
    # Unset started and finished times in case of resubmission.
    itask.set_event_time('started')
    itask.set_event_time('finished')
    itask.summary['latest_message'] = TASK_OUTPUT_SUBMITTED
    self.setup_event_handlers(itask, TASK_OUTPUT_SUBMITTED,
                              'job submitted')

    self.pflag = True
    if itask.state.status == TASK_STATUS_READY:
        # In rare occassions, the submit command of a batch system has sent
        # the job to its server, and the server has started the job before
        # the job submit command returns.
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
        try:
            # Arm the submission timeout from the configured interval.
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = (
                itask.summary['submitted_time'] +
                float(self._get_events_conf(itask, 'submission timeout')))
        except (TypeError, ValueError):
            # No (or unparseable) timeout configured.
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = None
        self.set_poll_time(itask)
def _process_message_succeeded(self, itask, event_time):
    """Helper for process_message, handle a succeeded message."""
    self.pflag = True
    itask.set_event_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 0,
        "time_run_exit": event_time,
    })
    # Update mean elapsed time only on task succeeded.
    if itask.summary['started_time'] is not None:
        itask.tdef.elapsed_times.append(itask.summary['finished_time'] -
                                        itask.summary['started_time'])
    if not itask.state.outputs.all_completed():
        # NOTE(review): unlike the sibling versions of this handler, this
        # one reports ALL incomplete outputs (including expired/
        # submit-failed/failed) and fires the success handlers without
        # checking the reset_state() return value - presumably an older
        # revision; confirm before unifying.
        message = "Succeeded with unreported outputs:"
        for output in itask.state.outputs.get_not_completed():
            message += "\n " + output
        LOG.info(message, itask=itask)
    itask.state.reset_state(TASK_STATUS_SUCCEEDED)
    self.setup_event_handlers(itask, "succeeded", "job succeeded")
def set_poll_time(itask, now=None):
    """Set the next task execution/submission poll time.

    If now is set, set the timer only if the previous delay is done.
    Return the next delay.
    """
    timer = itask.poll_timers.get(itask.state.status)
    if timer is None:
        # No poll timer configured for this state.
        return
    if now is not None and not timer.is_delay_done(now):
        # Previous delay still pending.
        return
    if timer.num is None:
        timer.num = 0
    next_delay = timer.next(no_exhaust=True)
    if next_delay is not None:
        LOG.info(
            'next job poll in %s (after %s)' % (
                timer.delay_as_seconds(), timer.timeout_as_str()),
            itask=itask)
    return next_delay
def set_poll_time(itask, now=None):
    """Set the next task execution/submission poll time.

    If now is set, set the timer only if the previous delay is done.
    Return the next delay.
    """
    status_timer = itask.poll_timers.get(itask.state.status)
    if status_timer is None:
        # No poll timer for the task's current state.
        return
    if now is not None and not status_timer.is_delay_done(now):
        # Previous delay has not elapsed yet.
        return
    if status_timer.num is None:
        status_timer.num = 0
    delay = status_timer.next(no_exhaust=True)
    if delay is None:
        return delay
    LOG.info(
        'next job poll in %s (after %s)' % (
            status_timer.delay_as_seconds(),
            status_timer.timeout_as_str()),
        itask=itask)
    return delay
def print_result(result):
    """Print result"""
    out = result['OUT']
    if out:
        LOG.info('result> ' + out.strip())
    err = result['ERR']
    if err:
        LOG.info('FAILED> ' + result['CMD'])
        LOG.info(err.strip())
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        # Nothing to record.
        return
    if isinstance(ctx.cmd_key, tuple):
        # An event handler: submit number rides on the command key.
        submit_num = ctx.cmd_key[-1]
    activity_path = get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(activity_path, "ab") as activity_file:
            activity_file.write(ctx_str + '\n')
    except IOError as exc:
        # This happens when there is no job directory, e.g. if job host
        # selection command causes an submission failure, there will be
        # no job directory. In this case, just send the information to
        # the suite log.
        LOG.debug(exc)
        LOG.info(ctx_str)
    if ctx.cmd:
        if ctx.ret_code:
            LOG.error(ctx_str)
        else:
            LOG.debug(ctx_str)
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    text = str(ctx)
    if not text:
        return
    if isinstance(ctx.cmd_key, tuple):
        # An event handler: take the submit number from the command key.
        submit_num = ctx.cmd_key[-1]
    log_path = get_task_job_activity_log(suite, point, name, submit_num)
    try:
        with open(log_path, "ab") as handle:
            handle.write(text + '\n')
    except IOError as exc:
        # No job directory (e.g. a host selection command failed before
        # submission), so fall back to the suite log.
        LOG.debug(exc)
        LOG.info(text)
    if ctx.cmd and ctx.ret_code:
        LOG.error(text)
    elif ctx.cmd:
        LOG.debug(text)
def _process_message_submitted(self, itask, event_time):
    """Helper for process_message, handle a submit-succeeded message."""
    try:
        LOG.info(('job[%(submit_num)02d] submitted to'
                  ' %(host)s:%(batch_sys_name)s[%(submit_method_id)s]') %
                 itask.summary,
                 itask=itask)
    except KeyError:
        # Summary is missing one of the template keys; skip the log line.
        pass
    self.suite_db_mgr.put_update_task_jobs(
        itask, {
            "time_submit_exit": event_time,
            "submit_status": 0,
            "batch_sys_job_id": itask.summary.get('submit_method_id')
        })

    if itask.tdef.run_mode == 'simulation':
        # Simulate job execution at this point.
        itask.set_summary_time('submitted', event_time)
        itask.set_summary_time('started', event_time)
        itask.state.reset_state(TASK_STATUS_RUNNING)
        itask.state.outputs.set_completion(TASK_OUTPUT_STARTED, True)
        return

    itask.set_summary_time('submitted', event_time)
    # Unset started and finished times in case of resubmission.
    itask.set_summary_time('started')
    itask.set_summary_time('finished')
    itask.summary['latest_message'] = TASK_OUTPUT_SUBMITTED
    self.pflag = True
    if itask.state.status == TASK_STATUS_READY:
        # The job started message can (rarely) come in before the submit
        # command returns - in which case do not go back to 'submitted'.
        if itask.state.reset_state(TASK_STATUS_SUBMITTED):
            self.setup_event_handlers(
                itask, TASK_OUTPUT_SUBMITTED, 'job submitted')
    self._reset_job_timers(itask)
def _check_access_priv_and_report(
        self, required_privilege_level, log_info=True):
    """Check access privilege and log requests with identifying info.

    In debug mode log all requests including task messages. Otherwise log
    all user commands, and just the first info command from each client.

    Return:
        dict: containing the client session
    """
    self._check_access_priv(required_privilege_level)
    command = inspect.currentframe().f_back.f_code.co_name
    auth_user, prog_name, user, host, uuid = _get_client_info()
    priv_level = self._get_priv_level(auth_user)
    LOG.debug(self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
        user, host, prog_name, priv_level, uuid))
    # Precedence is debug OR (first-contact AND log_info), as before.
    first_command = uuid not in self.clients
    if cylc.flags.debug or (first_command and log_info):
        LOG.info(self.__class__.LOG_COMMAND_TMPL % (
            command, user, host, prog_name, uuid))
    self.clients.setdefault(uuid, {})
    self.clients[uuid]['time'] = time()
    self._housekeep()
    return self.clients[uuid]
def main():
    """Manual test playground."""
    # NOTE(review): logging.getLogger() normally takes a name string;
    # here LOG is passed directly - confirm LOG is a logger *name* in
    # this module, not a logger object.
    log = logging.getLogger(LOG)
    log.setLevel(logging.INFO)  # or logging.DEBUG
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    log.addHandler(handler)

    def print_result(result):
        """Print result"""
        if result['OUT']:
            LOG.info('result> ' + result['OUT'].strip())
        if result['ERR']:
            LOG.info('FAILED> ' + result['CMD'])
            LOG.info(result['ERR'].strip())

    # Queue a mix of job/poll/handler commands on a 3-worker pool.
    pool = SuiteProcPool(3)
    for i in range(3):
        com = "sleep 5 && echo Hello from JOB " + str(i)
        pool.put_command(SuiteProcPool.JOBS_SUBMIT, com, print_result)
        com = "sleep 5 && echo Hello from POLL " + str(i)
        pool.put_command("poll", com, print_result)
        com = "sleep 5 && echo Hello from HANDLER " + str(i)
        pool.put_command("event-handler", com, print_result)
    # One deliberately failing handler command.
    com = "sleep 5 && echo Hello from HANDLER && badcommand"
    pool.put_command("event-handler", com, print_result)

    LOG.info(' sleeping')
    time.sleep(3)
    pool.handle_results_async()
    LOG.info(' sleeping')
    time.sleep(3)
    pool.close()
    # pool.terminate()
    pool.handle_results_async()
    LOG.info(' sleeping')
    time.sleep(3)
    pool.join()
    pool.handle_results_async()
def clear(self, point_strings=None, namespaces=None, cancel_settings=None):
    """Clear settings globally, or for listed namespaces and/or points.

    Return a tuple (modified_settings, bad_options), where:
    * modified_settings is similar to the return value of the "put"
      method, but for removed settings.
    * bad_options is a dict in the form:
          {"point_strings": ["20020202", ..."], ...}
      The dict is only populated if there are options not associated
      with previous broadcasts. The keys can be:
      * point_strings: a list of bad point strings.
      * namespaces: a list of bad namespaces.
      * cancel: a list of tuples. Each tuple contains the keys of a bad
        setting.
    """
    # Arguments may arrive via a cherrypy JSON request body instead.
    if hasattr(cherrypy.request, "json"):
        point_strings = (
            cherrypy.request.json.get("point_strings", point_strings))
        namespaces = (
            cherrypy.request.json.get("namespaces", namespaces))
        cancel_settings = (
            cherrypy.request.json.get("cancel_settings", cancel_settings))
    point_strings = unicode_encode(point_strings)
    namespaces = unicode_encode(namespaces)
    cancel_settings = unicode_encode(cancel_settings)
    # If cancel_settings defined, only clear specific settings
    cancel_keys_list = self._settings_to_keys_list(cancel_settings)
    # Clear settings
    modified_settings = []
    with self.lock:
        for point_string, point_string_settings in self.settings.items():
            if point_strings and point_string not in point_strings:
                continue
            for namespace, namespace_settings in (
                    point_string_settings.items()):
                if namespaces and namespace not in namespaces:
                    continue
                # Depth-first walk of the nested settings dict via an
                # explicit stack of (key-path, sub-dict) pairs.
                stuff_stack = [([], namespace_settings)]
                while stuff_stack:
                    keys, stuff = stuff_stack.pop()
                    for key, value in stuff.items():
                        if isinstance(value, dict):
                            stuff_stack.append((keys + [key], value))
                        elif (not cancel_keys_list or
                                keys + [key] in cancel_keys_list):
                            # Leaf matches (or clearing everything):
                            # null it out and record the removed value,
                            # re-nested under its full key path.
                            stuff[key] = None
                            setting = {key: value}
                            for rkey in reversed(keys):
                                setting = {rkey: setting}
                            modified_settings.append(
                                (point_string, namespace, setting))
    # Prune any empty branches
    bad_options = self._get_bad_options(
        self._prune(), point_strings, namespaces, cancel_keys_list)
    # Log the broadcast
    self._append_db_queue(modified_settings, is_cancel=True)
    LOG.info(
        get_broadcast_change_report(modified_settings, is_cancel=True))
    if bad_options:
        LOG.error(get_broadcast_bad_options_report(bad_options))
    return (modified_settings, bad_options)
def put(self, point_strings=None, namespaces=None, settings=None,
        not_from_client=False):
    """Add new broadcast settings (server side interface).

    Return a tuple (modified_settings, bad_options) where:
      modified_settings is list of modified settings in the form:
        [("20200202", "foo", {"command scripting": "true"}, ...]
      bad_options is as described in the docstring for self.clear().
    """
    check_access_priv(self, 'full-control')
    self.report('broadcast_put')
    # Arguments may arrive via a cherrypy JSON request body instead.
    if not not_from_client:
        point_strings = (
            cherrypy.request.json.get("point_strings", point_strings))
        namespaces = (
            cherrypy.request.json.get("namespaces", namespaces))
        settings = (
            cherrypy.request.json.get("settings", settings))
    point_strings = unicode_encode(point_strings)
    namespaces = unicode_encode(namespaces)
    settings = unicode_encode(settings)
    modified_settings = []
    bad_point_strings = []
    bad_namespaces = []
    with self.lock:
        for setting in settings:
            for point_string in point_strings:
                # Standardise the point and check its validity.
                bad_point = False
                try:
                    point_string = standardise_point_string(point_string)
                except Exception:
                    # NOTE(review): broad catch - sibling code catches
                    # PointParsingError here; confirm whether that class
                    # is importable in this module before narrowing.
                    if point_string != '*':
                        bad_point_strings.append(point_string)
                    bad_point = True
                if not bad_point and point_string not in self.settings:
                    self.settings[point_string] = {}
                for namespace in namespaces:
                    if namespace not in self.linearized_ancestors:
                        bad_namespaces.append(namespace)
                    elif not bad_point:
                        if namespace not in self.settings[point_string]:
                            self.settings[point_string][namespace] = {}
                        # Merge the new setting into the stored tree.
                        self._addict(
                            self.settings[point_string][namespace],
                            setting)
                        modified_settings.append(
                            (point_string, namespace, setting))
    # Log the broadcast
    self._append_db_queue(modified_settings)
    LOG.info(get_broadcast_change_report(modified_settings))
    bad_options = {}
    if bad_point_strings:
        bad_options["point_strings"] = bad_point_strings
    if bad_namespaces:
        bad_options["namespaces"] = bad_namespaces
    return modified_settings, bad_options
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file or
    bad remote initialisation will cause a bad task - leading to submission
    failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)
    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)
    if not prepared_tasks:
        return bad_tasks
    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()
    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        # remote_init returns None while initialisation is still pending;
        # skip the group this round and try again later.
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.summary['latest_message'] = self.REMOTE_INIT_MSG
            continue
        # Persist
        if owner:
            owner_at_host = owner + '@' + host
        else:
            owner_at_host = host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info('submit-num=%d, owner@host=%s' % (
                itask.submit_num, owner_at_host), itask=itask)
            self.suite_db_mgr.put_insert_task_jobs(
                itask, {
                    'is_manual_submit': itask.is_manual_submit,
                    'try_num': itask.get_try_num(),
                    'time_submit': now_str,
                    'user_at_host': owner_at_host,
                    'batch_sys_name': itask.summary['batch_sys_name'],
                })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                self.task_events_mgr.log_task_job_activity(
                    SuiteProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED, ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED,
                    self.poll_task_jobs)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if cylc.flags.debug:
            cmd.append('--debug')
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [('host', host, is_remote_host),
                                      ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(
            GLOBAL_CFG.get_derived_host_item(
                suite, 'suite job log directory', host, owner))
        stdin_file_paths = []
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            if remote_mode:
                # In remote mode job files are piped to jobs-submit stdin.
                stdin_file_paths.append(
                    self.task_events_mgr.get_task_job_log(
                        suite, itask.point, itask.tdef.name,
                        itask.submit_num, self.JOB_FILE_BASE))
            job_log_dirs.append(
                self.task_events_mgr.get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
            # The job file is now (about to be) used: reset the file write
            # flag so that subsequent manual retrigger will generate a new
            # job file.
            itask.local_job_file_path = None
            itask.state.reset_state(TASK_STATUS_READY)
            if itask.state.outputs.has_custom_triggers():
                self.suite_db_mgr.put_update_task_outputs(itask)
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SuiteProcContext(self.JOBS_SUBMIT, cmd,
                             stdin_file_paths=stdin_file_paths,
                             job_log_dirs=job_log_dirs, **kwargs),
            self._submit_task_jobs_callback, [suite, itasks])
    return done_tasks
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file or
    bad remote initialisation will cause a bad task - leading to submission
    failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)
    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)
    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()
    if not prepared_tasks:
        return bad_tasks
    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        # remote_init returns None while initialisation is still pending;
        # skip the group this round and try again later.
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.summary['latest_message'] = self.REMOTE_INIT_MSG
            continue
        # Persist
        if owner:
            owner_at_host = owner + '@' + host
        else:
            owner_at_host = host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                'submit-num=%d, owner@host=%s' % (
                    itask.submit_num, owner_at_host),
                itask=itask)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SuiteProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if cylc.flags.debug:
            cmd.append('--debug')
        if cylc.flags.utc:
            cmd.append('--utc-mode')
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        stdin_file_paths = []
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            if remote_mode:
                # In remote mode job files are piped to jobs-submit stdin.
                stdin_file_paths.append(
                    get_task_job_job_log(
                        suite, itask.point, itask.tdef.name,
                        itask.submit_num))
            job_log_dirs.append(get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num))
            # The job file is now (about to be) used: reset the file write
            # flag so that subsequent manual retrigger will generate a new
            # job file.
            itask.local_job_file_path = None
            itask.state.reset_state(TASK_STATUS_READY)
            if itask.state.outputs.has_custom_triggers():
                self.suite_db_mgr.put_update_task_outputs(itask)
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SuiteProcContext(
                self.JOBS_SUBMIT,
                cmd,
                stdin_file_paths=stdin_file_paths,
                job_log_dirs=job_log_dirs,
                **kwargs
            ),
            self._submit_task_jobs_callback, [suite, itasks])
    return done_tasks
def _run_event_mail_callback(proc_ctx):
    """Callback the mail command for notification of a suite event.

    Log the process context at warning level when the mail command
    exited non-zero, otherwise at info level.
    """
    log_func = LOG.warning if proc_ctx.ret_code else LOG.info
    log_func(str(proc_ctx))
def init_host(self, reg, host, owner):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Raise RemoteJobHostInitError if initialisation cannot complete.
    """
    if host is None:
        host = 'localhost'
    # No-op for the suite host itself, an already-initialised target, or
    # single task mode.
    if ((host, owner) in [('localhost', None), ('localhost', USER)] or
            (host, owner) in self.init_host_map or self.single_task_mode):
        return
    user_at_host = host
    if owner:
        user_at_host = owner + '@' + host
    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite run directory', host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite job log directory', host, owner)
    r_suite_srv_dir = os.path.join(
        r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

    # Create a UUID file in the service directory.
    # If remote host has the file in its service directory, we can assume
    # that the remote host has a shared file system with the suite host.
    ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
    uuid_str = str(uuid4())
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
    try:
        open(uuid_fname, 'wb').close()
        proc = Popen(
            shlex.split(ssh_tmpl) + [
                '-n', user_at_host,
                'test', '-e', os.path.join(r_suite_srv_dir, uuid_str)],
            stdout=PIPE, stderr=PIPE)
        if proc.wait() == 0:
            # Initialised, but no need to tidy up
            self.init_host_map[(host, owner)] = False
            return
    finally:
        # Always remove the local probe file, even on shared-FS detection.
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass

    cmds = []
    # Command to create suite directory structure on remote host.
    cmds.append(shlex.split(ssh_tmpl) + [
        '-n', user_at_host,
        'mkdir', '-p',
        r_suite_run_dir, r_log_job_dir, r_suite_srv_dir])
    # Command to copy contact and authentication files to remote host.
    # Note: no need to do this if task communication method is "poll".
    # should_unlink doubles as "files were installed, so tidy them later".
    should_unlink = GLOBAL_CFG.get_host_item(
        'task communication method', host, owner) != "poll"
    if should_unlink:
        scp_tmpl = GLOBAL_CFG.get_host_item('scp command', host, owner)
        cmds.append(shlex.split(scp_tmpl) + [
            '-p',
            self.suite_srv_files_mgr.get_contact_file(reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg),
            user_at_host + ':' + r_suite_srv_dir + '/'])
        # Command to copy python library to remote host.
        # NOTE(review): kept inside the "should_unlink" branch, which is
        # the only scope where scp_tmpl is defined - confirm intent.
        suite_run_py = os.path.join(
            GLOBAL_CFG.get_derived_host_item(reg, 'suite run directory'),
            'python')
        if os.path.isdir(suite_run_py):
            cmds.append(shlex.split(scp_tmpl) + [
                '-pr', suite_run_py,
                user_at_host + ':' + r_suite_run_dir + '/'])
    # Run commands in sequence.
    for cmd in cmds:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_INIT,
                user_at_host, ' '.join([quote(item) for item in cmd]),
                proc.returncode, out, err)
    self.init_host_map[(host, owner)] = should_unlink
    LOG.info('Initialised %s:%s' % (user_at_host, r_suite_run_dir))
def _prep_submit_task_job_impl(self, suite, itask):
    """Helper for self._prep_submit_task_job."""
    # Apply any broadcast settings on top of the task's runtime config;
    # deep-copy only when there is something to override.
    overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
        itask.identity)
    if overrides:
        rtconfig = pdeepcopy(itask.tdef.rtconfig)
        poverride(rtconfig, overrides)
    else:
        rtconfig = itask.tdef.rtconfig

    # Retry delays, needed for the try_num
    self._set_retry_timers(itask, rtconfig)

    # Submit number and try number
    LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
    itask.submit_num += 1
    itask.summary['submit_num'] = itask.submit_num
    itask.local_job_file_path = None
    self.suite_db_mgr.put_insert_task_jobs(itask, {
        "is_manual_submit": itask.is_manual_submit,
        "try_num": itask.get_try_num(),
        "time_submit": get_current_time_string(),
    })

    itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
    for name in rtconfig['extra log files']:
        itask.summary['logfiles'].append(expandvars(name))

    # Determine task host settings now, just before job submission,
    # because dynamic host selection may be used.

    # host may be None (= run task on suite host)
    itask.task_host = get_task_host(rtconfig['remote']['host'])
    if not itask.task_host:
        itask.task_host = 'localhost'
    elif itask.task_host != "localhost":
        LOG.info("[%s] -Task host: %s" % (
            itask.identity, itask.task_host))
    itask.task_owner = rtconfig['remote']['owner']
    if itask.task_owner:
        user_at_host = itask.task_owner + "@" + itask.task_host
    else:
        user_at_host = itask.task_host
    itask.summary['host'] = user_at_host
    itask.summary['job_hosts'][itask.submit_num] = user_at_host
    # Batch-system config may be absent for this host (TypeError) or
    # missing an entry for this batch system (KeyError).
    try:
        batch_sys_conf = self.task_events_mgr.get_host_conf(
            itask, 'batch systems')[rtconfig['job']['batch system']]
    except (TypeError, KeyError):
        batch_sys_conf = {}
    # Execution time limit may be unset (None -> TypeError on float()).
    try:
        itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
            rtconfig['job']['execution time limit'])
    except TypeError:
        pass
    if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
        # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
        # minutes after time limit exceeded
        itask.poll_timers[self.KEY_EXECUTE_TIME_LIMIT] = (
            TaskActionTimer(delays=batch_sys_conf.get(
                'execution time limit polling intervals', [60, 120, 420])))
    for label, key in [
            ('submission polling intervals', TASK_STATUS_SUBMITTED),
            ('execution polling intervals', TASK_STATUS_RUNNING)]:
        if key in itask.poll_timers:
            itask.poll_timers[key].reset()
        else:
            values = self.task_events_mgr.get_host_conf(
                itask, label, skey='job')
            if values:
                itask.poll_timers[key] = TaskActionTimer(delays=values)

    self.init_host(suite, itask.task_host, itask.task_owner)
    if itask.state.outputs.has_custom_triggers():
        self.suite_db_mgr.put_update_task_outputs(itask)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "user_at_host": user_at_host,
        "batch_sys_name": itask.summary['batch_sys_name'],
    })
    itask.is_manual_submit = False

    scripts = self._get_job_scripts(itask, rtconfig)

    # Location of job file, etc
    self._create_job_log_path(suite, itask)
    job_d = self.task_events_mgr.get_task_job_id(
        itask.point, itask.tdef.name, itask.submit_num)
    job_file_path = os.path.join(
        GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory",
            itask.task_host, itask.task_owner),
        job_d, self.JOB_FILE_BASE)
    return {
        'batch_system_name': rtconfig['job']['batch system'],
        'batch_submit_command_template': (
            rtconfig['job']['batch submit command template']),
        'batch_system_conf': batch_sys_conf,
        'directives': rtconfig['directives'],
        'environment': rtconfig['environment'],
        'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
        'env-script': rtconfig['env-script'],
        'err-script': rtconfig['err-script'],
        'host': itask.task_host,
        'init-script': rtconfig['init-script'],
        'job_file_path': job_file_path,
        'job_d': job_d,
        'namespace_hierarchy': itask.tdef.namespace_hierarchy,
        'owner': itask.task_owner,
        'param_var': itask.tdef.param_var,
        'post-script': scripts[2],
        'pre-script': scripts[0],
        'remote_suite_d': rtconfig['remote']['suite definition directory'],
        'script': scripts[1],
        'shell': rtconfig['job']['shell'],
        'submit_num': itask.submit_num,
        'suite_name': suite,
        'task_id': itask.identity,
        'try_num': itask.get_try_num(),
        'work_d': rtconfig['work sub-directory'],
    }
def init_host(self, reg, host, owner):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Raise RemoteJobHostInitError if initialisation cannot complete.
    """
    if host is None:
        host = 'localhost'
    # No-op for single task mode, an already-initialised target, or a
    # target that is not actually remote.
    if (self.single_task_mode or
            (host, owner) in self.init_host_map or
            not is_remote(host, owner)):
        return
    user_at_host = host
    if owner:
        user_at_host = owner + '@' + host
    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite run directory', host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite job log directory', host, owner)
    r_suite_srv_dir = os.path.join(
        r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

    # Create a UUID file in the service directory.
    # If remote host has the file in its service directory, we can assume
    # that the remote host has a shared file system with the suite host.
    ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
    uuid_str = str(uuid4())
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
    try:
        open(uuid_fname, 'wb').close()
        proc = Popen(
            shlex.split(ssh_tmpl) + [
                '-n', user_at_host,
                'test', '-e', os.path.join(r_suite_srv_dir, uuid_str)],
            stdout=PIPE, stderr=PIPE)
        if proc.wait() == 0:
            # Initialised, but no need to tidy up
            self.init_host_map[(host, owner)] = False
            return
    finally:
        # Always remove the local probe file, even on shared-FS detection.
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass

    cmds = []
    # Command to create suite directory structure on remote host.
    cmds.append(shlex.split(ssh_tmpl) + [
        '-n', user_at_host,
        'mkdir', '-p',
        r_suite_run_dir, r_log_job_dir, r_suite_srv_dir])
    # Command to copy contact and authentication files to remote host.
    # Note: no need to do this if task communication method is "poll".
    # should_unlink doubles as "files were installed, so tidy them later".
    should_unlink = GLOBAL_CFG.get_host_item(
        'task communication method', host, owner) != "poll"
    if should_unlink:
        scp_tmpl = GLOBAL_CFG.get_host_item('scp command', host, owner)
        # Handle not having SSL certs installed.
        try:
            ssl_cert = self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg)
        except (SuiteServiceFileError, ValueError):
            ssl_cert = None
        cmds.append(shlex.split(scp_tmpl) + [
            '-p',
            self.suite_srv_files_mgr.get_contact_file(reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
            user_at_host + ':' + r_suite_srv_dir + '/'])
        if ssl_cert is not None:
            # Insert before the destination (last) argument.
            cmds[-1].insert(-1, ssl_cert)
        # Command to copy python library to remote host.
        # NOTE(review): kept inside the "should_unlink" branch, which is
        # the only scope where scp_tmpl is defined - confirm intent.
        suite_run_py = os.path.join(
            GLOBAL_CFG.get_derived_host_item(reg, 'suite run directory'),
            'python')
        if os.path.isdir(suite_run_py):
            cmds.append(shlex.split(scp_tmpl) + [
                '-pr', suite_run_py,
                user_at_host + ':' + r_suite_run_dir + '/'])
    # Run commands in sequence.
    for cmd in cmds:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_INIT,
                user_at_host, ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err)
    self.init_host_map[(host, owner)] = should_unlink
    LOG.info('Initialised %s:%s' % (user_at_host, r_suite_run_dir))
def _prep_submit_task_job_impl(self, suite, itask):
    """Helper for self._prep_submit_task_job."""
    # Apply any broadcast settings on top of the task's runtime config;
    # deep-copy only when there is something to override.
    overrides = BroadcastServer.get_inst().get(itask.identity)
    if overrides:
        rtconfig = pdeepcopy(itask.tdef.rtconfig)
        poverride(rtconfig, overrides)
    else:
        rtconfig = itask.tdef.rtconfig

    # Retry delays, needed for the try_num
    self._set_retry_timers(itask, rtconfig)

    # Submit number and try number
    LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
    itask.submit_num += 1
    itask.summary['submit_num'] = itask.submit_num
    itask.local_job_file_path = None
    self.suite_db_mgr.put_insert_task_jobs(itask, {
        "is_manual_submit": itask.is_manual_submit,
        "try_num": itask.get_try_num(),
        "time_submit": get_current_time_string(),
    })

    itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
    for name in rtconfig['extra log files']:
        itask.summary['logfiles'].append(expandvars(name))

    # Determine task host settings now, just before job submission,
    # because dynamic host selection may be used.

    # host may be None (= run task on suite host)
    itask.task_host = get_task_host(rtconfig['remote']['host'])
    if not itask.task_host:
        itask.task_host = 'localhost'
    elif itask.task_host != "localhost":
        LOG.info("[%s] -Task host: %s" % (
            itask.identity, itask.task_host))
    itask.task_owner = rtconfig['remote']['owner']
    if itask.task_owner:
        user_at_host = itask.task_owner + "@" + itask.task_host
    else:
        user_at_host = itask.task_host
    itask.summary['host'] = user_at_host
    itask.summary['job_hosts'][itask.submit_num] = user_at_host
    # Batch-system config may be absent for this host (TypeError) or
    # missing an entry for this batch system (KeyError).
    try:
        batch_sys_conf = self.task_events_mgr.get_host_conf(
            itask, 'batch systems')[rtconfig['job']['batch system']]
    except (TypeError, KeyError):
        batch_sys_conf = {}
    # Execution time limit may be unset (None -> TypeError on float()).
    try:
        itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
            rtconfig['job']['execution time limit'])
    except TypeError:
        pass
    if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
        # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
        # minutes after time limit exceeded
        itask.poll_timers[self.KEY_EXECUTE_TIME_LIMIT] = (
            TaskActionTimer(delays=batch_sys_conf.get(
                'execution time limit polling intervals', [60, 120, 420])))
    for label, key in [
            ('submission polling intervals', TASK_STATUS_SUBMITTED),
            ('execution polling intervals', TASK_STATUS_RUNNING)]:
        if key in itask.poll_timers:
            itask.poll_timers[key].reset()
        else:
            values = self.task_events_mgr.get_host_conf(
                itask, label, skey='job')
            if values:
                itask.poll_timers[key] = TaskActionTimer(delays=values)

    self.init_host(suite, itask.task_host, itask.task_owner)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "user_at_host": user_at_host,
        "batch_sys_name": itask.summary['batch_sys_name'],
    })
    itask.is_manual_submit = False

    scripts = self._get_job_scripts(itask, rtconfig)

    # Location of job file, etc
    self._create_job_log_path(suite, itask)
    job_d = self.task_events_mgr.get_task_job_id(
        itask.point, itask.tdef.name, itask.submit_num)
    job_file_path = os.path.join(
        GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory",
            itask.task_host, itask.task_owner),
        job_d, self.JOB_FILE_BASE)
    return {
        'batch_system_name': rtconfig['job']['batch system'],
        'batch_submit_command_template': (
            rtconfig['job']['batch submit command template']),
        'batch_system_conf': batch_sys_conf,
        'directives': rtconfig['directives'],
        'environment': rtconfig['environment'],
        'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
        'env-script': rtconfig['env-script'],
        'err-script': rtconfig['err-script'],
        'host': itask.task_host,
        'init-script': rtconfig['init-script'],
        'job_file_path': job_file_path,
        'job_d': job_d,
        'namespace_hierarchy': itask.tdef.namespace_hierarchy,
        'owner': itask.task_owner,
        'param_var': itask.tdef.param_var,
        'post-script': scripts[2],
        'pre-script': scripts[0],
        'remote_suite_d': rtconfig['remote']['suite definition directory'],
        'script': scripts[1],
        'shell': rtconfig['job']['shell'],
        'submit_num': itask.submit_num,
        'suite_name': suite,
        'task_id': itask.identity,
        'try_num': itask.get_try_num(),
        'work_d': rtconfig['work sub-directory'],
    }
def _reset_job_timers(self, itask):
    """Set up poll timer and timeout for task."""
    if itask.state.status not in TASK_STATUSES_ACTIVE:
        # Reset, task not active
        itask.timeout = None
        itask.poll_timer = None
        return
    ctx = (itask.submit_num, itask.state.status)
    if itask.poll_timer and itask.poll_timer.ctx == ctx:
        # Timer already set up for this submit/status combination.
        return
    # Set poll timer
    # Set timeout
    timeref = None  # reference time, submitted or started time
    timeout = None  # timeout in setting
    if itask.state.status == TASK_STATUS_RUNNING:
        timeref = itask.summary['started_time']
        timeout_key = 'execution timeout'
        timeout = self._get_events_conf(itask, timeout_key)
        delays = self.get_host_conf(
            itask, 'execution polling intervals', skey='job',
            default=[900])  # Default 15 minute intervals
        if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
            time_limit = itask.summary[self.KEY_EXECUTE_TIME_LIMIT]
            try:
                host_conf = self.get_host_conf(itask, 'batch systems')
                batch_sys_conf = host_conf[itask.summary['batch_sys_name']]
            except (TypeError, KeyError):
                batch_sys_conf = {}
            time_limit_delays = batch_sys_conf.get(
                'execution time limit polling intervals', [60, 120, 420])
            # Poll faster once the execution time limit has been exceeded.
            timeout = time_limit + sum(time_limit_delays)
            # Remove excessive polling before time limit
            while sum(delays) > time_limit:
                del delays[-1]
            # But fill up the gap before time limit
            if delays:
                size = int((time_limit - sum(delays)) / delays[-1])
                delays.extend([delays[-1]] * size)
            time_limit_delays[0] += time_limit - sum(delays)
            delays += time_limit_delays
    else:  # if itask.state.status == TASK_STATUS_SUBMITTED:
        timeref = itask.summary['submitted_time']
        timeout_key = 'submission timeout'
        timeout = self._get_events_conf(itask, timeout_key)
        delays = self.get_host_conf(
            itask, 'submission polling intervals', skey='job',
            default=[900])  # Default 15 minute intervals
    # timeref/timeout may be unset (TypeError) or non-numeric (ValueError).
    try:
        itask.timeout = timeref + float(timeout)
        timeout_str = intvl_as_str(timeout)
    except (TypeError, ValueError):
        itask.timeout = None
        timeout_str = None
    itask.poll_timer = TaskActionTimer(ctx=ctx, delays=delays)
    # Log timeout and polling schedule
    message = 'health check settings: %s=%s' % (timeout_key, timeout_str)
    # Attempt to group identical consecutive delays as N*DELAY,...
    if itask.poll_timer.delays:
        items = []  # [(number of item - 1, item), ...]
        for delay in itask.poll_timer.delays:
            if items and items[-1][1] == delay:
                items[-1][0] += 1
            else:
                items.append([0, delay])
        message += ', polling intervals='
        for num, item in items:
            if num:
                message += '%d*' % (num + 1)
            message += '%s,' % intvl_as_str(item)
        message += '...'
    LOG.info(message, itask=itask)
    # Set next poll time
    self.check_poll_time(itask)
def init_suite_run_dir(self, reg, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Raise RemoteJobHostInitError if initialisation cannot complete.
    """
    if "@" in user_at_host:
        owner, host = user_at_host.split("@", 1)
    else:
        owner, host = None, user_at_host
    # No-op for the suite host itself, an already-initialised target, or
    # single task mode.
    # FIX: self.initialised_hosts is keyed by user_at_host (see both
    # assignments below), so membership must be tested against
    # user_at_host - testing the bare host never matched "owner@host"
    # entries and re-initialised such targets on every call.
    if (
        (owner, host) in [(None, "localhost"), (USER, "localhost")]
        or user_at_host in self.initialised_hosts
        or self.single_task_mode
    ):
        return
    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        reg, "suite run directory", host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        reg, "suite job log directory", host, owner)
    r_suite_srv_dir = os.path.join(
        r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

    # Create a UUID file in the service directory.
    # If remote host has the file in its service directory, we can assume
    # that the remote host has a shared file system with the suite host.
    ssh_tmpl = GLOBAL_CFG.get_host_item(
        "remote shell template", host, owner)
    uuid_str = str(uuid4())
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
    try:
        open(uuid_fname, "wb").close()
        proc = Popen(
            shlex.split(ssh_tmpl) + [
                "-n", user_at_host,
                "test", "-e", os.path.join(r_suite_srv_dir, uuid_str)],
            stdout=PIPE, stderr=PIPE)
        if proc.wait() == 0:
            # Initialised, but no need to tidy up
            self.initialised_hosts[user_at_host] = False
            return
    finally:
        # Always remove the local probe file, even on shared-FS detection.
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass

    cmds = []
    # Command to create suite directory structure on remote host.
    cmds.append(shlex.split(ssh_tmpl) + [
        "-n", user_at_host,
        "mkdir", "-p",
        r_suite_run_dir, r_log_job_dir, r_suite_srv_dir])
    # Command to copy contact and authentication files to remote host.
    # Note: no need to do this if task communication method is "poll".
    # should_unlink doubles as "files were installed, so tidy them later".
    should_unlink = GLOBAL_CFG.get_host_item(
        "task communication method", host, owner) != "poll"
    if should_unlink:
        scp_tmpl = GLOBAL_CFG.get_host_item(
            "remote copy template", host, owner)
        cmds.append(shlex.split(scp_tmpl) + [
            "-p",
            self.suite_srv_files_mgr.get_contact_file(reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg),
            user_at_host + ":" + r_suite_srv_dir + "/",
        ])
        # Command to copy python library to remote host.
        suite_run_py = os.path.join(
            GLOBAL_CFG.get_derived_host_item(reg, "suite run directory"),
            "python")
        if os.path.isdir(suite_run_py):
            cmds.append(shlex.split(scp_tmpl) + [
                "-pr", suite_run_py,
                user_at_host + ":" + r_suite_run_dir + "/"])
    # Run commands in sequence.
    for cmd in cmds:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_INIT,
                user_at_host,
                " ".join([quote(item) for item in cmd]),
                proc.returncode, out, err)
    self.initialised_hosts[user_at_host] = should_unlink
    LOG.info("Initialised %s:%s" % (user_at_host, r_suite_run_dir))