def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host.
    Put a job command for each user@host to the multiprocess pool.

    """
    if not itasks:
        return
    auth_itasks = {}
    for itask in itasks:
        if (itask.task_host, itask.task_owner) not in auth_itasks:
            auth_itasks[(itask.task_host, itask.task_owner)] = []
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    for (host, owner), itasks in sorted(auth_itasks.items()):
        cmd = ["cylc", cmd_key]
        if LOG.isEnabledFor(DEBUG):
            cmd.append("--debug")
        if is_remote_host(host):
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, "suite job log directory", host, owner))
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            job_log_dirs.append(get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SubProcContext(cmd_key, cmd), callback, [suite, itasks])
def _process_message_failed(self, itask, event_time, message):
    """Helper for process_message, handle a failed message."""
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    self.job_pool.set_job_time(job_d, 'finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    if (TASK_STATUS_RETRYING not in itask.try_timers
            or itask.try_timers[TASK_STATUS_RETRYING].next() is None):
        # No retry lined up: definitive failure.
        self.pflag = True
        if itask.state.reset(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, "failed", message)
            self.job_pool.set_job_state(job_d, TASK_STATUS_FAILED)
        LOG.critical(
            "[%s] -job(%02d) %s", itask, itask.submit_num, "failed")
    elif itask.state.reset(TASK_STATUS_RETRYING):
        delay_msg = "retrying in %s" % (
            itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
        if itask.state.is_held:
            delay_msg = "held (%s)" % delay_msg
        msg = "failed, %s" % (delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        self.setup_event_handlers(
            itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
    self._reset_job_timers(itask)
def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message."""
    LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": event_time,
        "submit_status": 1,
    })
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    self.job_pool.set_job_attr(job_d, 'batch_sys_job_id', None)
    itask.summary['submit_method_id'] = None
    self.pflag = True
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers
            or itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        # See github #476.
        if itask.state.reset(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                'job %s' % self.EVENT_SUBMIT_FAILED)
            self.job_pool.set_job_state(job_d, TASK_STATUS_SUBMIT_FAILED)
    elif itask.state.reset(TASK_STATUS_SUBMIT_RETRYING):
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        if itask.state.is_held:
            delay_msg = "held (%s)" % delay_msg
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        self.setup_event_handlers(
            itask, self.EVENT_SUBMIT_RETRY,
            "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
        self.job_pool.set_job_state(job_d, TASK_STATUS_SUBMIT_RETRYING)
    self._reset_job_timers(itask)
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host.
    Put a job command for each user@host to the multiprocess pool.

    """
    if not itasks:
        return
    auth_itasks = {}
    for itask in itasks:
        if (itask.task_host, itask.task_owner) not in auth_itasks:
            auth_itasks[(itask.task_host, itask.task_owner)] = []
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    for (host, owner), itasks in sorted(auth_itasks.items()):
        cmd = ["cylc", cmd_key]
        if LOG.isEnabledFor(DEBUG):
            cmd.append("--debug")
        if is_remote_host(host):
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            job_log_dirs.append(
                get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SubProcContext(cmd_key, cmd), callback, [suite, itasks])
def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _submit_task_jobs_callback, on one task job."""
    ctx = SubProcContext(self.JOBS_SUBMIT, None)
    ctx.out = line
    items = line.split("|")
    try:
        ctx.timestamp, _, ctx.ret_code = items[0:3]
    except ValueError:
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
    if ctx.ret_code == SubProcPool.RET_CODE_SUITE_STOPPING:
        return
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    try:
        itask.summary['submit_method_id'] = items[3]
        self.job_pool.set_job_attr(job_d, 'batch_sys_job_id', items[3])
    except IndexError:
        itask.summary['submit_method_id'] = None
    if itask.summary['submit_method_id'] == "None":
        itask.summary['submit_method_id'] = None
    if itask.summary['submit_method_id'] and ctx.ret_code == 0:
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_SUBMITTED, ctx.timestamp)
    else:
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            ctx.timestamp)
def _process_message_succeeded(self, itask, event_time):
    """Helper for process_message, handle a succeeded message."""
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    self.job_pool.set_job_time(job_d, 'finished', event_time)
    self.pflag = True
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 0,
        "time_run_exit": event_time,
    })
    # Update mean elapsed time only on task succeeded.
    if itask.summary['started_time'] is not None:
        itask.tdef.elapsed_times.append(
            itask.summary['finished_time'] -
            itask.summary['started_time'])
    if not itask.state.outputs.all_completed():
        msg = ""
        for output in itask.state.outputs.get_not_completed():
            if output not in [
                    TASK_OUTPUT_EXPIRED,
                    TASK_OUTPUT_SUBMIT_FAILED,
                    TASK_OUTPUT_FAILED]:
                msg += "\n " + output
        if msg:
            LOG.info(
                "[%s] -Succeeded with outputs not completed: %s",
                itask, msg)
    if itask.state.reset(TASK_STATUS_SUCCEEDED):
        self.setup_event_handlers(itask, "succeeded", "job succeeded")
        self.job_pool.set_job_state(job_d, TASK_STATUS_SUCCEEDED)
    self._reset_job_timers(itask)
def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message.

    Return True if no retries (hence go to the submit-failed state).
    """
    no_retries = False
    LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": event_time,
        "submit_status": 1,
    })
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    self.job_pool.set_job_state(job_d, TASK_STATUS_SUBMIT_FAILED)
    itask.summary['submit_method_id'] = None
    self.pflag = True
    if (
            TimerFlags.SUBMISSION_RETRY not in itask.try_timers
            or itask.try_timers[TimerFlags.SUBMISSION_RETRY].next() is None
    ):
        # No submission retry lined up: definitive failure.
        # See github #476.
        no_retries = True
        if itask.state.reset(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                f'job {self.EVENT_SUBMIT_FAILED}')
    else:
        # There is a submission retry lined up.
        timer = itask.try_timers[TimerFlags.SUBMISSION_RETRY]
        self._retry_task(itask, timer.timeout, submit_retry=True)
        delay_msg = f"submit-retrying in {timer.delay_timeout_as_str()}"
        if itask.state.is_held:
            delay_msg = f"held ({delay_msg})"
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        self.setup_event_handlers(
            itask, self.EVENT_SUBMIT_RETRY,
            f"job {self.EVENT_SUBMIT_FAILED}, {delay_msg}")
    self._reset_job_timers(itask)
    return no_retries
def _process_message_submitted(self, itask, event_time):
    """Helper for process_message, handle a submit-succeeded message."""
    try:
        LOG.info(
            '[%s] -job[%02d] submitted to %s:%s[%s]',
            itask,
            itask.summary['submit_num'],
            itask.summary['platforms_used'][itask.summary['submit_num']],
            itask.summary['batch_sys_name'],
            itask.summary['submit_method_id'])
    except KeyError:
        pass
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": event_time,
        "submit_status": 0,
        "batch_sys_job_id": itask.summary.get('submit_method_id')})
    if itask.tdef.run_mode == 'simulation':
        # Simulate job execution at this point.
        itask.set_summary_time('submitted', event_time)
        itask.set_summary_time('started', event_time)
        itask.state.reset(TASK_STATUS_RUNNING)
        itask.state.outputs.set_completion(TASK_OUTPUT_STARTED, True)
        return
    itask.set_summary_time('submitted', event_time)
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    self.job_pool.set_job_time(job_d, 'submitted', event_time)
    self.job_pool.set_job_state(job_d, TASK_STATUS_SUBMITTED)
    # Unset started and finished times in case of resubmission.
    itask.set_summary_time('started')
    itask.set_summary_time('finished')
    itask.set_summary_message(TASK_OUTPUT_SUBMITTED)
    self.pflag = True
    if itask.state.status == TASK_STATUS_READY:
        # The job started message can (rarely) come in before the submit
        # command returns - in which case do not go back to 'submitted'.
        if itask.state.reset(TASK_STATUS_SUBMITTED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMITTED, f'job {self.EVENT_SUBMITTED}')
        self._reset_job_timers(itask)
def _process_message_started(self, itask, event_time):
    """Helper for process_message, handle a started message."""
    if itask.job_vacated:
        itask.job_vacated = False
        LOG.warning("[%s] -Vacated job restarted", itask)
    self.pflag = True
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    if itask.state.reset(TASK_STATUS_RUNNING):
        self.setup_event_handlers(itask, 'started', 'job started')
        self.job_pool.set_job_state(job_d, TASK_STATUS_RUNNING)
    itask.set_summary_time('started', event_time)
    self.job_pool.set_job_time(job_d, 'started', event_time)
    self._reset_job_timers(itask)
    self.suite_db_mgr.put_update_task_jobs(
        itask, {"time_run": itask.summary['started_time_string']})
    # submission was successful so reset submission try number
    if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
        itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
def _process_message_failed(self, itask, event_time, message):
    """Helper for process_message, handle a failed message.

    Return True if no retries (hence go to the failed state).
    """
    no_retries = False
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    job_d = get_task_job_id(
        itask.point, itask.tdef.name, itask.submit_num)
    self.job_pool.set_job_time(job_d, 'finished', event_time)
    self.job_pool.set_job_state(job_d, TASK_STATUS_FAILED)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    self.pflag = True
    if (
            TimerFlags.EXECUTION_RETRY not in itask.try_timers
            or itask.try_timers[TimerFlags.EXECUTION_RETRY].next() is None
    ):
        # No retry lined up: definitive failure.
        if itask.state.reset(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, self.EVENT_FAILED, message)
        LOG.critical(
            "[%s] -job(%02d) %s", itask, itask.submit_num, "failed")
        no_retries = True
    else:
        # There is an execution retry lined up.
        timer = itask.try_timers[TimerFlags.EXECUTION_RETRY]
        self._retry_task(itask, timer.timeout)
        delay_msg = f"retrying in {timer.delay_timeout_as_str()}"
        if itask.state.is_held:
            delay_msg = "held (%s)" % delay_msg
        msg = "failed, %s" % (delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        self.setup_event_handlers(
            itask, self.EVENT_RETRY, f"{self.JOB_FAILED}, {delay_msg}")
    self._reset_job_timers(itask)
    return no_retries
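# The failure/retry branches above hinge on a try timer whose next() call
# returns the next delay, or None once all configured delays are used up.
# The following is a minimal illustrative sketch of that pattern only; it
# is not the actual cylc try-timer class, and the names are assumptions.
class SimpleRetryTimer:
    def __init__(self, delays):
        self.delays = list(delays)  # e.g. retry delays in seconds
        self.num = 0                # how many retries have been consumed

    def next(self):
        """Return the next delay, or None if no retries remain."""
        if self.num >= len(self.delays):
            return None
        delay = self.delays[self.num]
        self.num += 1
        return delay


# Usage sketch: once next() returns None, the "definitive failure" branch
# above would be taken instead of scheduling another retry.
timer = SimpleRetryTimer([30.0, 300.0])
print(timer.next())  # 30.0  -> retry lined up
print(timer.next())  # 300.0 -> retry lined up
print(timer.next())  # None  -> no retry: definitive failure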
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their platform_name and host.
    Put a job command for each group to the multiprocess pool.

    """
    if not itasks:
        return
    # Sort itasks into lists based upon where they were run.
    auth_itasks = {}
    for itask in itasks:
        platform_n = itask.platform['name']
        if platform_n not in auth_itasks:
            auth_itasks[platform_n] = []
        auth_itasks[platform_n].append(itask)
    # Go through each list of itasks and carry out commands as required.
    for platform_n, itasks in sorted(auth_itasks.items()):
        platform = get_platform(platform_n)
        if is_remote_platform(platform):
            remote_mode = True
            cmd = [cmd_key]
        else:
            cmd = ["cylc", cmd_key]
            remote_mode = False
        if LOG.isEnabledFor(DEBUG):
            cmd.append("--debug")
        cmd.append("--")
        cmd.append(get_remote_suite_run_job_dir(platform, suite))
        job_log_dirs = []
        if remote_mode:
            cmd = construct_ssh_cmd(cmd, platform)
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            job_log_dirs.append(
                get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SubProcContext(cmd_key, cmd), callback, [suite, itasks])
def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _kill_task_jobs_callback, on one task job."""
    ctx = SubProcContext(self.JOBS_KILL, None)
    ctx.out = line
    try:
        ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
    except ValueError:
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
    log_lvl = INFO
    log_msg = 'killed'
    if ctx.ret_code:  # non-zero exit status
        log_lvl = WARNING
        log_msg = 'kill failed'
        itask.state.kill_failed = True
    elif itask.state(TASK_STATUS_SUBMITTED):
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            ctx.timestamp)
    elif itask.state(TASK_STATUS_RUNNING):
        self.task_events_mgr.process_message(
            itask, CRITICAL, TASK_OUTPUT_FAILED)
    else:
        log_lvl = DEBUG
        log_msg = (
            'ignoring job kill result, unexpected task state: %s' %
            itask.state.status)
    itask.set_summary_message(log_msg)
    self.job_pool.add_job_msg(
        get_task_job_id(itask.point, itask.tdef.name, itask.submit_num),
        log_msg)
    LOG.log(
        log_lvl,
        "[%s] -job(%02d) %s" % (itask.identity, itask.submit_num, log_msg))
def _prep_submit_task_job_impl(self, suite, itask, rtconfig):
    """Helper for self._prep_submit_task_job."""
    itask.task_owner = rtconfig['remote']['owner']
    if itask.task_owner:
        owner_at_host = itask.task_owner + "@" + itask.task_host
    else:
        owner_at_host = itask.task_host
    itask.summary['host'] = owner_at_host
    itask.summary['job_hosts'][itask.submit_num] = owner_at_host
    itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
    for name in rtconfig['extra log files']:
        itask.summary['logfiles'].append(
            os.path.expanduser(os.path.expandvars(name)))
    try:
        batch_sys_conf = self.task_events_mgr.get_host_conf(
            itask, 'batch systems')[itask.summary['batch_sys_name']]
    except (TypeError, KeyError):
        batch_sys_conf = {}
    try:
        itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
            rtconfig['job']['execution time limit'])
    except TypeError:
        pass
    scripts = self._get_job_scripts(itask, rtconfig)

    # Location of job file, etc
    self._create_job_log_path(suite, itask)
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    job_file_path = get_remote_suite_run_job_dir(
        itask.task_host, itask.task_owner, suite, job_d, JOB_LOG_JOB)
    return {
        'batch_system_name': rtconfig['job']['batch system'],
        'batch_submit_command_template': (
            rtconfig['job']['batch submit command template']),
        'batch_system_conf': batch_sys_conf,
        'dependencies': itask.state.get_resolved_dependencies(),
        'directives': rtconfig['directives'],
        'environment': rtconfig['environment'],
        'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
        'env-script': rtconfig['env-script'],
        'err-script': rtconfig['err-script'],
        'exit-script': rtconfig['exit-script'],
        'host': itask.task_host,
        'init-script': rtconfig['init-script'],
        'job_file_path': job_file_path,
        'job_d': job_d,
        'namespace_hierarchy': itask.tdef.namespace_hierarchy,
        'owner': itask.task_owner,
        'param_env_tmpl': rtconfig['parameter environment templates'],
        'param_var': itask.tdef.param_var,
        'post-script': scripts[2],
        'pre-script': scripts[0],
        'remote_suite_d': rtconfig['remote']['suite definition directory'],
        'script': scripts[1],
        'submit_num': itask.submit_num,
        'suite_name': suite,
        'task_id': itask.identity,
        'try_num': itask.get_try_num(),
        'uuid_str': self.task_remote_mgr.uuid_str,
        'work_d': rtconfig['work sub-directory'],
    }
def process_message(
        self, itask, severity, message, event_time=None,
        flag=FLAG_INTERNAL, submit_num=None,
):
    """Parse a task message and update task state.

    Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

    It is possible for the current state of a task to be inconsistent with
    a message (whether internal, received or polled) e.g. due to a late
    poll result, or a network outage, or manual state reset. To handle
    this, if a message would take the task state backward, issue a poll to
    confirm instead of changing state - then always believe the next
    message. Note that the next message might not be the result of this
    confirmation poll, in the unlikely event that a job emits a succession
    of messages very quickly, but this is the best we can do without
    somehow uniquely associating each poll with its result message.

    Arguments:
        itask (cylc.flow.task_proxy.TaskProxy):
            The task proxy object relevant for the message.
        severity (str or int):
            Message severity, should be a recognised logging level.
        message (str):
            Message content.
        event_time (str):
            Event time stamp. Expect ISO8601 date time string.
            If not specified, use current time.
        flag (str):
            If specified, can be:
                FLAG_INTERNAL (default): To indicate an internal message.
                FLAG_RECEIVED: To indicate a message received from a job
                    or an external source.
                FLAG_POLLED: To indicate a message resulted from a poll.
        submit_num (int):
            The submit number of the task relevant for the message.
            If not specified, use latest submit number.

    Return:
        None: in normal circumstances.
        True: if polling is required to confirm a reversal of status.

    """
    # Log messages
    if event_time is None:
        event_time = get_current_time_string()
    if submit_num is None:
        submit_num = itask.submit_num
    if not self._process_message_check(
            itask, severity, message, event_time, flag, submit_num):
        return None

    # always update the suite state summary for latest message
    if flag == self.FLAG_POLLED:
        new_msg = f'{message} {self.FLAG_POLLED}'
    else:
        new_msg = message
    itask.set_summary_message(new_msg)
    self.job_pool.add_job_msg(
        get_task_job_id(itask.point, itask.tdef.name, submit_num),
        new_msg)

    # Satisfy my output, if possible, and record the result.
    completed_trigger = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    if message == TASK_OUTPUT_STARTED:
        if (flag == self.FLAG_RECEIVED
                and itask.state.is_gt(TASK_STATUS_RUNNING)):
            return True
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if (flag == self.FLAG_RECEIVED
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if (flag == self.FLAG_RECEIVED
                and itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
            return True
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if (flag == self.FLAG_RECEIVED
                and itask.state.is_gt(TASK_STATUS_SUBMITTED)):
            return True
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # Task received signal.
        if (flag == self.FLAG_RECEIVED
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        if (flag == self.FLAG_RECEIVED
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        # Believe this and change state without polling (could poll?).
        self.pflag = True
        itask.state.reset(TASK_STATUS_SUBMITTED)
        self._reset_job_timers(itask)
        # We should really have a special 'vacated' handler, but given that
        # this feature can only be used on the deprecated loadleveler
        # system, we should probably aim to remove support for job vacation
        # instead. Otherwise, we should have:
        # self.setup_event_handlers(itask, 'vacated', message)
    elif completed_trigger:
        # Message of an as-yet unreported custom task output.
        # No state change.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
        self.setup_event_handlers(itask, completed_trigger, message)
    else:
        # Unhandled messages. These include:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '[%s] status=%s: unhandled: %s',
            itask, itask.state.status, message)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)
    lseverity = str(severity).lower()
    if lseverity in self.NON_UNIQUE_EVENTS:
        itask.non_unique_events.setdefault(lseverity, 0)
        itask.non_unique_events[lseverity] += 1
        self.setup_event_handlers(itask, lseverity, message)
    return None
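# The docstring's rule of never letting a received message move a task
# backward is implemented by the "flag == FLAG_RECEIVED and is_gt(...)"
# guards above. Below is a standalone, illustrative sketch of that
# comparison only; the status ordering here is an assumption for the
# example and is not cylc's real is_gt() implementation.
STATUS_ORDER = ['waiting', 'ready', 'submitted', 'running', 'succeeded']


def would_go_backward(current_status, message_status):
    """True if accepting message_status would move the task backward."""
    return (STATUS_ORDER.index(current_status)
            > STATUS_ORDER.index(message_status))


# A late "submitted" message for a task that is already running should
# trigger a confirmation poll rather than a state change:
print(would_go_backward('running', 'submitted'))  # True  -> poll to confirm
print(would_go_backward('submitted', 'running'))  # False -> accept message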
def submit_task_jobs(self, suite, itasks, curve_auth,
                     client_pub_key_dir, is_simulation=False):
    """Prepare for job submission and submit task jobs.

    Preparation (host selection, remote host init, and remote install)
    is done asynchronously. Newly released tasks may be sent here
    several times until these init subprocesses have returned. Failure
    during preparation is considered to be job submission failure.

    Once preparation has completed or failed, reset .waiting_on_job_prep
    in task instances so the scheduler knows to stop sending them back
    here.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.subshell_eval_reset()

    if not prepared_tasks:
        return bad_tasks
    auth_itasks = {}  # {platform: [itask, ...], ...}
    for itask in prepared_tasks:
        platform_name = itask.platform['name']
        auth_itasks.setdefault(platform_name, [])
        auth_itasks[platform_name].append(itask)
    # Submit task jobs for each platform
    done_tasks = bad_tasks
    for platform_name, itasks in sorted(auth_itasks.items()):
        platform = itasks[0].platform
        install_target = get_install_target_from_platform(platform)
        ri_map = self.task_remote_mgr.remote_init_map

        if (ri_map.get(install_target) != REMOTE_FILE_INSTALL_DONE):
            if install_target == get_localhost_install_target():
                # Skip init and file install for localhost.
                LOG.debug(f"REMOTE INIT NOT REQUIRED for {install_target}")
                ri_map[install_target] = (REMOTE_FILE_INSTALL_DONE)

            elif install_target not in ri_map:
                # Remote init not in progress for target, so start it.
                self.task_remote_mgr.remote_init(
                    platform, curve_auth, client_pub_key_dir)
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                    self.data_store_mgr.delta_job_msg(
                        get_task_job_id(
                            itask.point, itask.tdef.name,
                            itask.submit_num),
                        self.REMOTE_INIT_MSG)
                continue

            elif (ri_map[install_target] == REMOTE_INIT_DONE):
                # Already done remote init so move on to file install
                self.task_remote_mgr.file_install(platform)
                continue

            elif (ri_map[install_target] in self.IN_PROGRESS.keys()):
                # Remote init or file install in progress.
                for itask in itasks:
                    msg = self.IN_PROGRESS[ri_map[install_target]]
                    itask.set_summary_message(msg)
                    self.data_store_mgr.delta_job_msg(
                        get_task_job_id(
                            itask.point, itask.tdef.name,
                            itask.submit_num),
                        msg)
                continue

        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        host = get_host_from_platform(platform)
        if (
                self.job_runner_mgr.is_job_local_to_host(
                    itask.summary['job_runner_name'])
                and not is_remote_platform(platform)
        ):
            host = get_host()

        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, host=%s',
                itask, itask.submit_num, host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'platform_name': itask.platform['name'],
                'job_runner_name': itask.summary['job_runner_name'],
            })
            itask.is_manual_submit = False

        if (ri_map[install_target]
                in [REMOTE_INIT_FAILED, REMOTE_FILE_INSTALL_FAILED]):
            # Remote init or install failed. Set submit-failed for all
            # affected tasks and remove target from remote init map
            # - this enables new tasks to re-initialise that target
            init_error = (ri_map[install_target])
            del ri_map[install_target]
            for itask in itasks:
                itask.waiting_on_job_prep = False
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % host,
                        err=init_error,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self._prep_submit_task_job_error(
                    suite, itask, '(remote init)', '')
            continue

        # Build the "cylc jobs-submit" command
        cmd = [self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        if is_remote_platform(itask.platform):
            remote_mode = True
            cmd.append('--remote-mode')
        else:
            remote_mode = False
        if itask.platform['clean job submission environment']:
            cmd.append('--clean-env')
        for var in itask.platform[
                'job submission environment pass-through']:
            cmd.append(f"--env={var}")
        for path in itask.platform[
                'job submission executable paths'] + SYSPATH:
            cmd.append(f"--path={path}")
        cmd.append('--')
        cmd.append(get_remote_suite_run_job_dir(platform, suite))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = (
            len(itasks)
            // ((len(itasks) // platform['max batch submit size']) + 1)
            + 1)
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])

        if remote_mode:
            cmd = construct_ssh_cmd(cmd, platform)
        else:
            cmd = ['cylc'] + cmd

        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        os.path.expandvars(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num)))
                job_log_dirs.append(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None

                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)

                itask.waiting_on_job_prep = False
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                ),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
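# The chunking arithmetic above splits a large group of prepared tasks
# into near-equal batches no larger than the platform's
# "max batch submit size" (the older host/owner variants hard-code 100).
# A small worked example of that arithmetic, assuming a limit of 100:
max_batch = 100
itasks = list(range(250))  # stand-ins for 250 prepared task proxies

chunk_size = len(itasks) // ((len(itasks) // max_batch) + 1) + 1
batches = [itasks[i:i + chunk_size]
           for i in range(0, len(itasks), chunk_size)]

print(chunk_size)                 # 84
print([len(b) for b in batches])  # [84, 84, 82] -> three near-equal batches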
def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _poll_task_jobs_callback, on one task job."""
    ctx = SubProcContext(self.JOBS_POLL, None)
    ctx.out = line
    ctx.ret_code = 0

    # See cylc.flow.batch_sys_manager.JobPollContext
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    try:
        job_log_dir, context = line.split('|')[1:3]
        items = json.loads(context)
        jp_ctx = JobPollContext(job_log_dir, **items)
    except TypeError:
        itask.set_summary_message(self.POLL_FAIL)
        self.job_pool.add_job_msg(job_d, self.POLL_FAIL)
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
        return
    except ValueError:
        # back compat for cylc 7.7.1 and previous
        try:
            values = line.split('|')
            items = dict(  # done this way to ensure IndexError is raised
                (key, values[x])
                for x, key in enumerate(JobPollContext.CONTEXT_ATTRIBUTES))
            job_log_dir = items.pop('job_log_dir')
            # Build the poll context from the legacy fields as well,
            # otherwise jp_ctx would be undefined below.
            jp_ctx = JobPollContext(job_log_dir, **items)
        except (ValueError, IndexError):
            itask.set_summary_message(self.POLL_FAIL)
            self.job_pool.add_job_msg(job_d, self.POLL_FAIL)
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
    finally:
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

    flag = self.task_events_mgr.FLAG_POLLED
    if jp_ctx.run_status == 1 and jp_ctx.run_signal in ["ERR", "EXIT"]:
        # Failed normally
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
    elif jp_ctx.run_status == 1 and jp_ctx.batch_sys_exit_polled == 1:
        # Failed by a signal, and no longer in batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
        self.task_events_mgr.process_message(
            itask, INFO, FAIL_MESSAGE_PREFIX + jp_ctx.run_signal,
            jp_ctx.time_run_exit, flag)
    elif jp_ctx.run_status == 1:
        # The job has terminated, but is still managed by batch system.
        # Some batch system may restart a job in this state, so don't
        # mark as failed yet.
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
    elif jp_ctx.run_status == 0:
        # The job succeeded
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_SUCCEEDED, jp_ctx.time_run_exit, flag)
    elif jp_ctx.time_run and jp_ctx.batch_sys_exit_polled == 1:
        # The job has terminated without executing the error trap
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, get_current_time_string(),
            flag)
    elif jp_ctx.time_run:
        # The job has started, and is still managed by batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
    elif jp_ctx.batch_sys_exit_polled == 1:
        # The job never ran, and no longer in batch system
        self.task_events_mgr.process_message(
            itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            jp_ctx.time_submit_exit, flag)
    else:
        # The job never ran, and is in batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_STATUS_SUBMITTED, jp_ctx.time_submit_exit,
            flag)
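# The poll callback above consumes one line per job of the general form
# "<prefix>|<job_log_dir>|<json context>". A minimal sketch of parsing
# such a line; the field names in the JSON payload are assumptions for
# illustration, not the definitive JobPollContext schema.
import json

line = ('poll|1/foo/01|'
        '{"run_status": 0, "time_run_exit": "2020-01-01T00:10:00Z"}')

job_log_dir, context = line.split('|')[1:3]
items = json.loads(context)

print(job_log_dir)          # 1/foo/01
print(items['run_status'])  # 0 -> would be treated as "job succeeded"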
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file
    or bad remote initialisation will cause a bad task - leading to
    submission failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()

    if not prepared_tasks:
        return bad_tasks

    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
                self.job_pool.add_job_msg(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num),
                    self.REMOTE_INIT_MSG)
            continue
        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        if (
                self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name'])
                and not is_remote_host(host)
        ):
            owner_at_host = get_host()
        else:
            owner_at_host = host
        # Persist
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, owner@host=%s',
                itask, itask.submit_num, owner_at_host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])
        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                    **kwargs),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
def _prep_submit_task_job_impl(self, suite, itask, rtconfig):
    """Helper for self._prep_submit_task_job."""
    itask.task_owner = rtconfig['remote']['owner']
    if itask.task_owner:
        owner_at_host = itask.task_owner + "@" + itask.task_host
    else:
        owner_at_host = itask.task_host
    itask.summary['host'] = owner_at_host
    itask.summary['job_hosts'][itask.submit_num] = owner_at_host
    itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
    for name in rtconfig['extra log files']:
        itask.summary['logfiles'].append(
            os.path.expanduser(os.path.expandvars(name)))
    try:
        batch_sys_conf = self.task_events_mgr.get_host_conf(
            itask, 'batch systems')[itask.summary['batch_sys_name']]
    except (TypeError, KeyError):
        batch_sys_conf = {}
    try:
        itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
            rtconfig['job']['execution time limit'])
    except TypeError:
        pass
    scripts = self._get_job_scripts(itask, rtconfig)

    # Location of job file, etc
    self._create_job_log_path(suite, itask)
    job_d = get_task_job_id(
        itask.point, itask.tdef.name, itask.submit_num)
    job_file_path = os.path.join(
        glbl_cfg().get_derived_host_item(
            suite, "suite job log directory",
            itask.task_host, itask.task_owner),
        job_d, JOB_LOG_JOB)
    return {
        'batch_system_name': rtconfig['job']['batch system'],
        'batch_submit_command_template': (
            rtconfig['job']['batch submit command template']),
        'batch_system_conf': batch_sys_conf,
        'dependencies': itask.state.get_resolved_dependencies(),
        'directives': rtconfig['directives'],
        'environment': rtconfig['environment'],
        'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
        'env-script': rtconfig['env-script'],
        'err-script': rtconfig['err-script'],
        'exit-script': rtconfig['exit-script'],
        'host': itask.task_host,
        'init-script': rtconfig['init-script'],
        'job_file_path': job_file_path,
        'job_d': job_d,
        'namespace_hierarchy': itask.tdef.namespace_hierarchy,
        'owner': itask.task_owner,
        'param_env_tmpl': rtconfig['parameter environment templates'],
        'param_var': itask.tdef.param_var,
        'post-script': scripts[2],
        'pre-script': scripts[0],
        'remote_suite_d': rtconfig['remote']['suite definition directory'],
        'script': scripts[1],
        'submit_num': itask.submit_num,
        'suite_name': suite,
        'task_id': itask.identity,
        'try_num': itask.get_try_num(),
        'uuid_str': self.task_remote_mgr.uuid_str,
        'work_d': rtconfig['work sub-directory'],
    }
def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _poll_task_jobs_callback, on one task job."""
    ctx = SubProcContext(self.JOBS_POLL, None)
    ctx.out = line
    ctx.ret_code = 0

    # See cylc.flow.job_runner_mgr.JobPollContext
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    try:
        job_log_dir, context = line.split('|')[1:3]
        items = json.loads(context)
        jp_ctx = JobPollContext(job_log_dir, **items)
    except TypeError:
        itask.set_summary_message(self.POLL_FAIL)
        self.data_store_mgr.delta_job_msg(job_d, self.POLL_FAIL)
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
        return
    except ValueError:
        itask.set_summary_message(self.POLL_FAIL)
        self.data_store_mgr.delta_job_msg(job_d, self.POLL_FAIL)
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
        return
    finally:
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

    flag = self.task_events_mgr.FLAG_POLLED
    if jp_ctx.run_status == 1 and jp_ctx.run_signal in ["ERR", "EXIT"]:
        # Failed normally
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
    elif jp_ctx.run_status == 1 and jp_ctx.job_runner_exit_polled == 1:
        # Failed by a signal, and no longer in job runner
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
        self.task_events_mgr.process_message(
            itask, INFO, FAIL_MESSAGE_PREFIX + jp_ctx.run_signal,
            jp_ctx.time_run_exit, flag)
    elif jp_ctx.run_status == 1:
        # The job has terminated, but is still managed by job runner.
        # Some job runners may restart a job in this state, so don't
        # mark as failed yet.
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
    elif jp_ctx.run_status == 0:
        # The job succeeded
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_SUCCEEDED, jp_ctx.time_run_exit, flag)
    elif jp_ctx.time_run and jp_ctx.job_runner_exit_polled == 1:
        # The job has terminated without executing the error trap
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, get_current_time_string(),
            flag)
    elif jp_ctx.time_run:
        # The job has started, and is still managed by job runner
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
    elif jp_ctx.job_runner_exit_polled == 1:
        # The job never ran, and no longer in job runner
        self.task_events_mgr.process_message(
            itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            jp_ctx.time_submit_exit, flag)
    else:
        # The job never ran, and is in job runner
        self.task_events_mgr.process_message(
            itask, INFO, TASK_STATUS_SUBMITTED, jp_ctx.time_submit_exit,
            flag)
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file
    or bad remote initialisation will cause a bad task - leading to
    submission failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()

    if not prepared_tasks:
        return bad_tasks

    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
            continue
        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        if (
            self.batch_sys_mgr.is_job_local_to_host(
                itask.summary['batch_sys_name'])
            and not is_remote_host(host)
        ):
            owner_at_host = get_host()
        else:
            owner_at_host = host
        # Persist
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, owner@host=%s',
                itask, itask.submit_num, owner_at_host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])
        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset_state(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                    **kwargs
                ),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
def _prep_submit_task_job_impl(self, suite, itask, rtconfig):
    """Helper for self._prep_submit_task_job."""
    itask.summary['platforms_used'][
        itask.submit_num] = itask.platform['name']
    itask.summary['job_runner_name'] = itask.platform['job runner']
    try:
        itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
            rtconfig['execution time limit'])
    except TypeError:
        pass
    scripts = self._get_job_scripts(itask, rtconfig)

    # Location of job file, etc
    self._create_job_log_path(suite, itask)
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    job_file_path = get_remote_suite_run_job_dir(
        itask.platform, suite, job_d, JOB_LOG_JOB)
    return {
        'job_runner_name': itask.platform['job runner'],
        'job_runner_command_template': (
            itask.platform['job runner command template']),
        'dependencies': itask.state.get_resolved_dependencies(),
        'directives': rtconfig['directives'],
        'environment': rtconfig['environment'],
        'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
        'env-script': rtconfig['env-script'],
        'err-script': rtconfig['err-script'],
        'exit-script': rtconfig['exit-script'],
        'platform': itask.platform,
        'init-script': rtconfig['init-script'],
        'job_file_path': job_file_path,
        'job_d': job_d,
        'namespace_hierarchy': itask.tdef.namespace_hierarchy,
        'param_var': itask.tdef.param_var,
        'post-script': scripts[2],
        'pre-script': scripts[0],
        'remote_suite_d': itask.platform['suite definition directory'],
        'script': scripts[1],
        'submit_num': itask.submit_num,
        'flow_label': itask.flow_label,
        'suite_name': suite,
        'task_id': itask.identity,
        'try_num': itask.get_try_num(),
        'uuid_str': self.task_remote_mgr.uuid_str,
        'work_d': rtconfig['work sub-directory'],
    }