Example #1
0
 def _process_message_failed(self, itask, event_time, message):
     """Helper for process_message, handle a failed message."""
     if event_time is None:
         event_time = get_current_time_string()
     itask.set_summary_time('finished', event_time)
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "run_status": 1,
         "time_run_exit": event_time,
     })
     if (TASK_STATUS_RETRYING not in itask.try_timers or
             itask.try_timers[TASK_STATUS_RETRYING].next() is None):
         # No retry lined up: definitive failure.
         self.pflag = True
         if itask.state.reset_state(TASK_STATUS_FAILED):
             self.setup_event_handlers(itask, "failed", message)
         LOG.critical("job(%02d) %s" % (
             itask.submit_num, "failed"), itask=itask)
     else:
         # There is a retry lined up
         delay_msg = "retrying in %s" % (
             itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
         msg = "failed, %s" % (delay_msg)
         LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
         itask.summary['latest_message'] = msg
         if itask.state.reset_state(TASK_STATUS_RETRYING):
             self.setup_event_handlers(
                 itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
     self._reset_job_timers(itask)
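A note on the retry decision above: cylc's try timers expose a next() that
returns the next retry delay, or None once retries are exhausted. A minimal
runnable sketch of that protocol, with a hypothetical TryTimer standing in
for the real try_timers entries (an assumption, not cylc's actual class):

    class TryTimer(object):
        """Hypothetical stand-in: yields configured delays, then None."""
        def __init__(self, delays):
            self.delays = list(delays)

        def next(self):
            # None signals "no retry lined up: definitive failure".
            return self.delays.pop(0) if self.delays else None

    timer = TryTimer([30, 60])
    while True:
        delay = timer.next()
        if delay is None:
            print("failed (no retries left)")
            break
        print("failed, retrying in %ss" % delay)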
Example #2
0
 def _process_message_succeeded(self, itask, event_time):
     """Helper for process_message, handle a succeeded message."""
     self.pflag = True
     itask.set_summary_time('finished', event_time)
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "run_status": 0,
         "time_run_exit": event_time,
     })
     # Update mean elapsed time only on task succeeded.
     if itask.summary['started_time'] is not None:
         itask.tdef.elapsed_times.append(
             itask.summary['finished_time'] -
             itask.summary['started_time'])
     if not itask.state.outputs.all_completed():
         msg = ""
         for output in itask.state.outputs.get_not_completed():
             if output not in [TASK_OUTPUT_EXPIRED,
                               TASK_OUTPUT_SUBMIT_FAILED,
                               TASK_OUTPUT_FAILED]:
                 msg += "\n  " + output
         if msg:
             LOG.info("Succeeded with outputs not completed: %s" % msg,
                      itask=itask)
     if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
         self.setup_event_handlers(itask, "succeeded", "job succeeded")
     self._reset_job_timers(itask)
Example #3
0
    def report(self, request, server_obj):
        """Log client requests with identifying information.

        In debug mode log all requests including task messages. Otherwise log
        all user commands, and just the first info request from each client.

        """
        if threading.current_thread().__class__.__name__ == '_MainThread':
            # Server methods may be called internally as well as by clients.
            return
        auth_user, prog_name, user, host, uuid, priv_level = get_client_info()
        name = server_obj.__class__.__name__
        log_me = (
            cylc.flags.debug or
            name in ["SuiteCommandServer",
                     "ExtTriggerServer",
                     "BroadcastServer"] or
            (name not in ["SuiteIdServer", "TaskMessageServer"] and
             uuid not in self.clients))
        if log_me:
            LOG.debug(
                self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
                    user, host, prog_name, priv_level, uuid)
            )
            LOG.info(
                self.__class__.LOG_COMMAND_TMPL % (
                    request, user, host, prog_name, uuid))
        if name == "SuiteIdServer":
            self._num_id_requests += 1
            self.report_id_requests()
        self.clients[uuid] = datetime.datetime.utcnow()
        self._housekeep()
Example #4
0
    def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
        """Poll jobs of specified tasks.

        Any job that is or was submitted or running can be polled, except for
        retrying tasks, which would (correctly) poll as failed. Succeeded
        tasks are not polled by default.

        This method uses _poll_task_jobs_callback() and
        _manip_task_jobs_callback() as helper/callback methods.

        _poll_task_job_callback() executes one specific job.
        """
        poll_me = []
        pollable = [
            TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED
        ]
        for itask in itasks:
            if itask.state.status in pollable or (
                    itask.state.status == TASK_STATUS_SUCCEEDED and poll_succ):
                poll_me.append(itask)
            else:
                LOG.debug("skipping %s: not pollable, "
                          "or skipping 'succeeded' tasks" % itask.identity)
        if poll_me:
            if msg is not None:
                LOG.info(msg)
            self._run_job_cmd(self.JOBS_POLL, suite, poll_me,
                              self._poll_task_jobs_callback)
Example #5
0
 def _process_message_submit_failed(self, itask, event_time):
     """Helper for process_message, handle a submit-failed message."""
     LOG.error(self.EVENT_SUBMIT_FAILED, itask=itask)
     if event_time is None:
         event_time = get_current_time_string()
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "time_submit_exit": event_time,
         "submit_status": 1,
     })
     itask.summary['submit_method_id'] = None
     if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
             itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
         # No submission retry lined up: definitive failure.
         self.pflag = True
         # See github #476.
         if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
             self.setup_event_handlers(
                 itask, self.EVENT_SUBMIT_FAILED,
                 'job %s' % self.EVENT_SUBMIT_FAILED)
     else:
         # There is a submission retry lined up.
         timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
         delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
         msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
         LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
         itask.summary['latest_message'] = msg
         if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
             self.setup_event_handlers(
                 itask, self.EVENT_SUBMIT_RETRY,
                 "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
     self._reset_job_timers(itask)
Example #6
0
    def report(self, request, server_obj):
        """Log client requests with identifying information.

        In debug mode log all requests including task messages. Otherwise log
        all user commands, and just the first info request from each client.

        """
        if threading.current_thread().__class__.__name__ == '_MainThread':
            # Server methods may be called internally as well as by clients.
            return
        auth_user, prog_name, user, host, uuid, priv_level = get_client_info()
        name = server_obj.__class__.__name__
        log_me = (
            cylc.flags.debug or
            name in ["SuiteCommandServer",
                     "ExtTriggerServer",
                     "BroadcastServer"] or
            (name not in ["SuiteIdServer", "TaskMessageServer"] and
             uuid not in self.clients))
        if log_me:
            LOG.debug(
                self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
                    user, host, prog_name, priv_level, uuid)
            )
            LOG.info(
                self.__class__.LOG_COMMAND_TMPL % (
                    request, user, host, prog_name, uuid))
        if name == "SuiteIdServer":
            self._num_id_requests += 1
            self.report_id_requests()
        self.clients[uuid] = datetime.datetime.utcnow()
        self._housekeep()
Example #7
0
    def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
        """Poll jobs of specified tasks.

        Any job that is or was submitted or running can be polled, except for
        retrying tasks, which would (correctly) poll as failed. Succeeded
        tasks are not polled by default.

        This method uses _poll_task_jobs_callback() and
        _manip_task_jobs_callback() as helper/callback methods.

        _poll_task_job_callback() executes one specific job.
        """
        to_poll_tasks = []
        pollable_statuses = set([
            TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED])
        if poll_succ:
            pollable_statuses.add(TASK_STATUS_SUCCEEDED)
        for itask in itasks:
            if itask.state.status in pollable_statuses:
                to_poll_tasks.append(itask)
            else:
                LOG.debug("skipping %s: not pollable, "
                          "or skipping 'succeeded' tasks" % itask.identity)
        if to_poll_tasks:
            if msg is not None:
                LOG.info(msg)
            self._run_job_cmd(
                self.JOBS_POLL, suite, to_poll_tasks,
                self._poll_task_jobs_callback)
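Examples #3/#4 and #7 apply the same filter; the set-based version above just
folds the poll_succ flag into the membership test up front instead of using a
compound condition. A quick standalone comparison, using plain strings as
stand-ins for the TASK_STATUS_* constants (assumed to be strings):

    poll_succ = False
    pollable_statuses = set(['submitted', 'running', 'failed'])
    if poll_succ:
        pollable_statuses.add('succeeded')

    statuses = ['running', 'retrying', 'succeeded', 'submitted']
    print([s for s in statuses if s in pollable_statuses])
    # ['running', 'submitted']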
Example #8
0
 def _process_message_failed(self, itask, event_time, message):
     """Helper for process_message, handle a failed message."""
     if event_time is None:
         event_time = get_current_time_string()
     itask.set_summary_time('finished', event_time)
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "run_status": 1,
         "time_run_exit": event_time,
     })
     if (TASK_STATUS_RETRYING not in itask.try_timers
             or itask.try_timers[TASK_STATUS_RETRYING].next() is None):
         # No retry lined up: definitive failure.
         self.pflag = True
         if itask.state.reset_state(TASK_STATUS_FAILED):
             self.setup_event_handlers(itask, "failed", message)
         LOG.critical("job(%02d) %s" % (itask.submit_num, "failed"),
                      itask=itask)
     else:
         # There is a retry lined up
         delay_msg = "retrying in %s" % (
             itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
         msg = "failed, %s" % (delay_msg)
         LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
         itask.summary['latest_message'] = msg
         if itask.state.reset_state(TASK_STATUS_RETRYING):
             self.setup_event_handlers(
                 itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
     self._reset_job_timers(itask)
Example #9
0
 def _process_message_succeeded(self, itask, event_time):
     """Helper for process_message, handle a succeeded message."""
     self.pflag = True
     itask.set_summary_time('finished', event_time)
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "run_status": 0,
         "time_run_exit": event_time,
     })
     # Update mean elapsed time only on task succeeded.
     if itask.summary['started_time'] is not None:
         itask.tdef.elapsed_times.append(itask.summary['finished_time'] -
                                         itask.summary['started_time'])
     if not itask.state.outputs.all_completed():
         msg = ""
         for output in itask.state.outputs.get_not_completed():
             if output not in [
                     TASK_OUTPUT_EXPIRED, TASK_OUTPUT_SUBMIT_FAILED,
                     TASK_OUTPUT_FAILED
             ]:
                 msg += "\n  " + output
         if msg:
             LOG.info("Succeeded with outputs not completed: %s" % msg,
                      itask=itask)
     if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
         self.setup_event_handlers(itask, "succeeded", "job succeeded")
     self._reset_job_timers(itask)
Example #10
0
 def _process_message_submit_failed(self, itask, event_time):
     """Helper for process_message, handle a submit-failed message."""
     LOG.error(self.EVENT_SUBMIT_FAILED, itask=itask)
     if event_time is None:
         event_time = get_current_time_string()
     self.suite_db_mgr.put_update_task_jobs(
         itask, {
             "time_submit_exit": event_time,
             "submit_status": 1,
         })
     itask.summary['submit_method_id'] = None
     if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
             itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
         # No submission retry lined up: definitive failure.
         self.pflag = True
         # See github #476.
         if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
             self.setup_event_handlers(itask, self.EVENT_SUBMIT_FAILED,
                                       'job %s' % self.EVENT_SUBMIT_FAILED)
     else:
         # There is a submission retry lined up.
         timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
         delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
         msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
         LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
         itask.summary['latest_message'] = msg
         if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
             self.setup_event_handlers(
                 itask, self.EVENT_SUBMIT_RETRY,
                 "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
     self._reset_job_timers(itask)
Example #11
0
 def load_db_broadcast_states(self, row_idx, row):
     """Load broadcast variables from runtime DB broadcast states row."""
     if row_idx == 0:
         LOG.info("LOADING broadcast states")
     point, namespace, key, value = row
     sections = []
     cur_key = key
     if "]" in cur_key:
         sections = self.REC_SECTION.findall(cur_key)
         cur_key = cur_key.rsplit("]", 1)[-1]
     with self.lock:
         self.broadcasts.setdefault(point, {})
         self.broadcasts[point].setdefault(namespace, {})
         dict_ = self.broadcasts[point][namespace]
         for section in sections:
             dict_.setdefault(section, {})
             dict_ = dict_[section]
         dict_[cur_key] = value
     LOG.info(
         CHANGE_FMT.strip() % {
             "change": CHANGE_PREFIX_SET,
             "point": point,
             "namespace": namespace,
             "key": key,
             "value": value
         })
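The key handling above splits a broadcast key such as "[environment][sub]FOO"
into its section names plus a final key, then walks (and creates) the nested
broadcasts dict with setdefault. A self-contained sketch; the regex here is an
assumption about what self.REC_SECTION matches, not cylc's actual pattern:

    import re

    REC_SECTION = re.compile(r"\[([^\[\]]+)\]")  # assumed section pattern

    key = "[environment][sub]FOO"
    sections, cur_key = [], key
    if "]" in cur_key:
        sections = REC_SECTION.findall(cur_key)   # ['environment', 'sub']
        cur_key = cur_key.rsplit("]", 1)[-1]      # 'FOO'

    broadcasts = {}
    dict_ = broadcasts.setdefault("20200202", {}).setdefault("foo", {})
    for section in sections:
        dict_ = dict_.setdefault(section, {})
    dict_[cur_key] = "bar"
    print(broadcasts)
    # {'20200202': {'foo': {'environment': {'sub': {'FOO': 'bar'}}}}}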
Example #12
0
    def _check_access_priv_and_report(self,
                                      required_privilege_level,
                                      log_info=True):
        """Check access privilege and log requests with identifying info.

        In debug mode log all requests including task messages. Otherwise log
        all user commands, and just the first info command from each client.

        Return:
            dict: containing the client session

        """
        self._check_access_priv(required_privilege_level)
        command = inspect.currentframe().f_back.f_code.co_name
        auth_user, prog_name, user, host, uuid = _get_client_info()
        priv_level = self._get_priv_level(auth_user)
        LOG.debug(self.__class__.LOG_CONNECT_ALLOWED_TMPL %
                  (user, host, prog_name, priv_level, uuid))
        if cylc.flags.debug or (uuid not in self.clients and log_info):
            LOG.info(self.__class__.LOG_COMMAND_TMPL %
                     (command, user, host, prog_name, uuid))
        self.clients.setdefault(uuid, {})
        self.clients[uuid]['time'] = time()
        self._housekeep()
        return self.clients[uuid]
Example #13
0
    def clear_broadcast(self,
                        point_strings=None,
                        namespaces=None,
                        cancel_settings=None):
        """Clear broadcasts globally, or for listed namespaces and/or points.

        Return a tuple (modified_settings, bad_options), where:
        * modified_settings is similar to the return value of the "put" method,
          but for removed broadcasts.
        * bad_options is a dict in the form:
              {"point_strings": ["20020202", ...], ...}
          The dict is only populated if there are options not associated with
          previous broadcasts. The keys can be:
          * point_strings: a list of bad point strings.
          * namespaces: a list of bad namespaces.
          * cancel: a list of tuples. Each tuple contains the keys of a bad
            setting.
        """
        # If cancel_settings defined, only clear specific broadcasts
        cancel_keys_list = self._settings_to_keys_list(cancel_settings)

        # Clear broadcasts
        modified_settings = []
        with self.lock:
            for point_string, point_string_settings in self.broadcasts.items():
                if point_strings and point_string not in point_strings:
                    continue
                for namespace, namespace_settings in (
                        point_string_settings.items()):
                    if namespaces and namespace not in namespaces:
                        continue
                    stuff_stack = [([], namespace_settings)]
                    while stuff_stack:
                        keys, stuff = stuff_stack.pop()
                        for key, value in stuff.items():
                            if isinstance(value, dict):
                                stuff_stack.append((keys + [key], value))
                            elif (not cancel_keys_list
                                  or keys + [key] in cancel_keys_list):
                                stuff[key] = None
                                setting = {key: value}
                                for rkey in reversed(keys):
                                    setting = {rkey: setting}
                                modified_settings.append(
                                    (point_string, namespace, setting))

        # Prune any empty branches
        bad_options = self._get_bad_options(self._prune(), point_strings,
                                            namespaces, cancel_keys_list)

        # Log the broadcast
        self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True)
        LOG.info(get_broadcast_change_report(modified_settings,
                                             is_cancel=True))
        if bad_options:
            LOG.error(get_broadcast_bad_options_report(bad_options))

        return (modified_settings, bad_options)
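The while-loop in clear_broadcast above is an iterative depth-first walk over
arbitrarily nested settings: each matched leaf is nulled out, and its setting
dict is rebuilt outwards from the key path. The traversal in isolation, as a
runnable sketch:

    namespace_settings = {
        "environment": {"FOO": "1", "BAR": "2"},
        "script": "true",
    }
    modified = []
    stuff_stack = [([], namespace_settings)]
    while stuff_stack:
        keys, stuff = stuff_stack.pop()
        for key, value in stuff.items():
            if isinstance(value, dict):
                stuff_stack.append((keys + [key], value))
            else:
                stuff[key] = None            # mark the leaf as cleared
                setting = {key: value}
                for rkey in reversed(keys):  # rebuild the nested setting
                    setting = {rkey: setting}
                modified.append(setting)
    print(sorted(str(s) for s in modified))
    # ["{'environment': {'BAR': '2'}}", "{'environment': {'FOO': '1'}}",
    #  "{'script': 'true'}"]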
Example #14
0
 def _run_event_handlers_callback(self, proc_ctx, abort_on_error=False):
     """Callback on completion of a suite event handler."""
     if proc_ctx.ret_code:
         msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
         LOG.error(str(proc_ctx))
         ERR.error(msg)
         if abort_on_error:
             raise SuiteEventError(msg)
     else:
         LOG.info(str(proc_ctx))
Example #15
0
 def _run_event_handlers_callback(proc_ctx, abort_on_error=False):
     """Callback on completion of a suite event handler."""
     if proc_ctx.ret_code:
         msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
         LOG.error(str(proc_ctx))
         LOG.error(msg)
         if abort_on_error:
             raise SuiteEventError(msg)
     else:
         LOG.info(str(proc_ctx))
Example #16
0
 def satisfy_xclock(self, itask):
     """Attempt to satisfy itask's clock trigger, if it has one."""
     label, sig, ctx, satisfied = self._get_xclock(itask)
     if satisfied:
         return
     if wall_clock(*ctx.func_args, **ctx.func_kwargs):
         satisfied = True
         itask.state.xclock = (label, True)
         self.sat_xclock.append(sig)
         LOG.info('clock xtrigger satisfied: %s = %s' % (label, str(ctx)))
Example #17
0
    def signout(self, server_obj):
        """Force forget this client (for use by GUI etc.)."""

        caller = server_obj.getLocalStorage().caller
        LOG.info(
            self.__class__.LOG_SIGNOUT_TMPL % (
                caller.user, caller.host, caller.prog_name, caller.uuid))
        try:
            del self.clients[caller.uuid]
        except KeyError:
            # Already forgotten.
            pass
        self._housekeep()
Example #18
0
    def signout(self, server_obj):
        """Force forget this client (for use by GUI etc.)."""

        caller = server_obj.getLocalStorage().caller
        LOG.info(
            self.__class__.LOG_SIGNOUT_TMPL % (
                caller.user, caller.host, caller.prog_name, caller.uuid))
        try:
            del self.clients[caller.uuid]
        except KeyError:
            # Already forgotten.
            pass
        self._housekeep()
Example #19
0
    def put_broadcast(self,
                      point_strings=None,
                      namespaces=None,
                      settings=None):
        """Add new broadcast settings (server side interface).

        Return a tuple (modified_settings, bad_options) where:
          modified_settings is a list of modified settings in the form:
            [("20200202", "foo", {"command scripting": "true"}), ...]
          bad_options is as described in the docstring for self.clear().
        """
        modified_settings = []
        bad_point_strings = []
        bad_namespaces = []

        with self.lock:
            for setting in settings:
                for point_string in point_strings:
                    # Standardise the point and check its validity.
                    bad_point = False
                    try:
                        point_string = standardise_point_string(point_string)
                    except PointParsingError:
                        if point_string != '*':
                            bad_point_strings.append(point_string)
                            bad_point = True
                    if not bad_point and point_string not in self.broadcasts:
                        self.broadcasts[point_string] = {}
                    for namespace in namespaces:
                        if namespace not in self.linearized_ancestors:
                            bad_namespaces.append(namespace)
                        elif not bad_point:
                            if namespace not in self.broadcasts[point_string]:
                                self.broadcasts[point_string][namespace] = {}
                            self._addict(
                                self.broadcasts[point_string][namespace],
                                setting)
                            modified_settings.append(
                                (point_string, namespace, setting))

        # Log the broadcast
        self.suite_db_mgr.put_broadcast(modified_settings)
        LOG.info(get_broadcast_change_report(modified_settings))

        bad_options = {}
        if bad_point_strings:
            bad_options["point_strings"] = bad_point_strings
        if bad_namespaces:
            bad_options["namespaces"] = bad_namespaces
        return modified_settings, bad_options
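Note that the validation loop in put_broadcast collects rather than raises:
unknown points and namespaces are accumulated into bad_options and reported
back to the caller. The pattern in isolation, with a toy stand-in for
standardise_point_string (cylc's real one raises PointParsingError):

    def standardise_point_string(point_string):
        """Toy stand-in: accept all-digit points only."""
        if not point_string.isdigit():
            raise ValueError(point_string)
        return point_string

    bad_point_strings = []
    for point_string in ["20200202", "next-tuesday", "*"]:
        try:
            point_string = standardise_point_string(point_string)
        except ValueError:
            if point_string != "*":  # "*" means all points, so not bad
                bad_point_strings.append(point_string)
    print(bad_point_strings)  # ['next-tuesday']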
Example #20
0
    def check_task_jobs(self, suite, task_pool):
        """Check submission and execution timeout and polling timers.

        Poll tasks that have timed out and/or have reached next polling time.
        """
        now = time()
        poll_tasks = set()
        for itask in task_pool.get_tasks():
            if self.task_events_mgr.check_job_time(itask, now):
                poll_tasks.add(itask)
                if itask.poll_timer.delay is not None:
                    LOG.info('poll now, (next in %s)' %
                             (itask.poll_timer.delay_timeout_as_str()),
                             itask=itask)
        if poll_tasks:
            self.poll_task_jobs(suite, poll_tasks)
Example #21
0
    def check_task_jobs(self, suite, task_pool):
        """Check submission and execution timeout and polling timers.

        Poll tasks that have timed out and/or have reached next polling time.
        """
        now = time()
        poll_tasks = set()
        for itask in task_pool.get_tasks():
            if self.task_events_mgr.check_job_time(itask, now):
                poll_tasks.add(itask)
                if itask.poll_timer.delay is not None:
                    LOG.info(
                        'poll now, (next in %s)' % (
                            itask.poll_timer.delay_timeout_as_str()),
                        itask=itask)
        if poll_tasks:
            self.poll_task_jobs(suite, poll_tasks)
Example #22
0
 def report_id_requests(self):
     """Report the frequency of identification (scan) requests."""
     current_time = time.time()
     interval = current_time - self._id_start_time
     if interval > self.CLIENT_ID_REPORT_SECONDS:
         rate = float(self._num_id_requests) / interval
         if rate > self.CLIENT_ID_MIN_REPORT_RATE:
             LOG.warning(
                 self.__class__.LOG_IDENTIFY_TMPL % (
                     self._num_id_requests, interval)
             )
         elif cylc.flags.debug:
             LOG.info(
                 self.__class__.LOG_IDENTIFY_TMPL % (
                     self._num_id_requests, interval)
             )
         self._id_start_time = current_time
         self._num_id_requests = 0
Example #23
0
 def report_id_requests(self):
     """Report the frequency of identification (scan) requests."""
     current_time = time.time()
     interval = current_time - self._id_start_time
     if interval > self.CLIENT_ID_REPORT_SECONDS:
         rate = float(self._num_id_requests) / interval
         if rate > self.CLIENT_ID_MIN_REPORT_RATE:
             LOG.warning(
                 self.__class__.LOG_IDENTIFY_TMPL % (
                     self._num_id_requests, interval)
             )
         elif cylc.flags.debug:
             LOG.info(
                 self.__class__.LOG_IDENTIFY_TMPL % (
                     self._num_id_requests, interval)
             )
         self._id_start_time = current_time
         self._num_id_requests = 0
Example #24
0
    def _process_message_submitted(self, itask, event_time):
        """Helper for process_message, handle a submit-succeeded message."""
        try:
            LOG.info(
                ('job[%(submit_num)02d] submitted to'
                 ' %(host)s:%(batch_sys_name)s[%(submit_method_id)s]') %
                itask.summary,
                itask=itask)
        except KeyError:
            pass
        self.suite_db_mgr.put_update_task_jobs(itask, {
            "time_submit_exit": event_time,
            "submit_status": 0,
            "batch_sys_job_id": itask.summary.get('submit_method_id')})

        if itask.tdef.run_mode == 'simulation':
            # Simulate job execution at this point.
            itask.set_summary_time('submitted', event_time)
            itask.set_summary_time('started', event_time)
            itask.state.reset_state(TASK_STATUS_RUNNING)
            itask.state.outputs.set_completion(TASK_OUTPUT_STARTED, True)
            return

        itask.set_summary_time('submitted', event_time)
        # Unset started and finished times in case of resubmission.
        itask.set_summary_time('started')
        itask.set_summary_time('finished')
        itask.summary['latest_message'] = TASK_OUTPUT_SUBMITTED
        self.setup_event_handlers(
            itask, TASK_OUTPUT_SUBMITTED, 'job submitted')

        self.pflag = True
        if itask.state.status == TASK_STATUS_READY:
            # The job started message can (rarely) come in before the submit
            # command returns - in which case do not go back to 'submitted'.
            itask.state.reset_state(TASK_STATUS_SUBMITTED)
            try:
                itask.timeout_timers[TASK_STATUS_SUBMITTED] = (
                    itask.summary['submitted_time'] +
                    float(self._get_events_conf(itask, 'submission timeout')))
            except (TypeError, ValueError):
                itask.timeout_timers[TASK_STATUS_SUBMITTED] = None
            self.set_poll_time(itask)
Example #25
0
    def _process_message_submitted(self, itask, event_time):
        """Helper for process_message, handle a submit-succeeded message."""
        try:
            LOG.info(('job[%(submit_num)02d] submitted to'
                      ' %(host)s:%(batch_sys_name)s[%(submit_method_id)s]') %
                     itask.summary,
                     itask=itask)
        except KeyError:
            pass
        self.suite_db_mgr.put_update_task_jobs(
            itask, {
                "time_submit_exit": event_time,
                "submit_status": 0,
                "batch_sys_job_id": itask.summary.get('submit_method_id')
            })

        if itask.tdef.run_mode == 'simulation':
            # Simulate job execution at this point.
            itask.set_event_time('started', event_time)
            itask.state.reset_state(TASK_STATUS_RUNNING)
            itask.state.outputs.set_completion(TASK_OUTPUT_STARTED, True)
            return

        itask.set_event_time('submitted', event_time)
        itask.set_event_time('started')
        itask.set_event_time('finished')
        itask.summary['latest_message'] = TASK_OUTPUT_SUBMITTED
        self.setup_event_handlers(itask, TASK_OUTPUT_SUBMITTED,
                                  'job submitted')

        self.pflag = True
        if itask.state.status == TASK_STATUS_READY:
            # On rare occasions, the submit command of a batch system has sent
            # the job to its server, and the server has started the job before
            # the job submit command returns.
            itask.state.reset_state(TASK_STATUS_SUBMITTED)
            try:
                itask.timeout_timers[TASK_STATUS_SUBMITTED] = (
                    itask.summary['submitted_time'] +
                    float(self._get_events_conf(itask, 'submission timeout')))
            except (TypeError, ValueError):
                itask.timeout_timers[TASK_STATUS_SUBMITTED] = None
            self.set_poll_time(itask)
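The submission timeout timer above is just the submitted time plus the
configured interval, with a try/except that disables the timer when either
piece is missing or malformed. In isolation (units are an assumption:
seconds since epoch plus a seconds interval):

    def submission_timeout(submitted_time, conf_value):
        try:
            return submitted_time + float(conf_value)
        except (TypeError, ValueError):
            return None  # timer disabled

    print(submission_timeout(1500000000.0, "300"))  # 1500000300.0
    print(submission_timeout(1500000000.0, None))   # None (not configured)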
Example #26
0
 def _process_message_succeeded(self, itask, event_time):
     """Helper for process_message, handle a succeeded message."""
     self.pflag = True
     itask.set_event_time('finished', event_time)
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "run_status": 0,
         "time_run_exit": event_time,
     })
     # Update mean elapsed time only on task succeeded.
     if itask.summary['started_time'] is not None:
         itask.tdef.elapsed_times.append(itask.summary['finished_time'] -
                                         itask.summary['started_time'])
     if not itask.state.outputs.all_completed():
         message = "Succeeded with unreported outputs:"
         for output in itask.state.outputs.get_not_completed():
             message += "\n  " + output
         LOG.info(message, itask=itask)
     itask.state.reset_state(TASK_STATUS_SUCCEEDED)
     self.setup_event_handlers(itask, "succeeded", "job succeeded")
Example #27
0
    def set_poll_time(itask, now=None):
        """Set the next task execution/submission poll time.

        If now is set, set the timer only if the previous delay is done.
        Return the next delay.
        """
        key = itask.state.status
        timer = itask.poll_timers.get(key)
        if timer is None:
            return
        if now is not None and not timer.is_delay_done(now):
            return
        if timer.num is None:
            timer.num = 0
        delay = timer.next(no_exhaust=True)
        if delay is not None:
            LOG.info('next job poll in %s (after %s)' %
                     (timer.delay_as_seconds(), timer.timeout_as_str()),
                     itask=itask)
        return delay
Example #28
0
    def set_poll_time(itask, now=None):
        """Set the next task execution/submission poll time.

        If now is set, set the timer only if the previous delay is done.
        Return the next delay.
        """
        key = itask.state.status
        timer = itask.poll_timers.get(key)
        if timer is None:
            return
        if now is not None and not timer.is_delay_done(now):
            return
        if timer.num is None:
            timer.num = 0
        delay = timer.next(no_exhaust=True)
        if delay is not None:
            LOG.info(
                'next job poll in %s (after %s)' % (
                    timer.delay_as_seconds(), timer.timeout_as_str()),
                itask=itask)
        return delay
Example #29
0
 def print_result(result):
     """Print result"""
     if result['OUT']:
         LOG.info('result> ' + result['OUT'].strip())
     if result['ERR']:
         LOG.info('FAILED> ' + result['CMD'])
         LOG.info(result['ERR'].strip())
Example #30
0
 def print_result(result):
     """Print result"""
     if result['OUT']:
         LOG.info('result> ' + result['OUT'].strip())
     if result['ERR']:
         LOG.info('FAILED> ' + result['CMD'])
         LOG.info(result['ERR'].strip())
Example #31
0
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(suite, point, name,
                                                 submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write(ctx_str + '\n')
    except IOError as exc:
        # This happens if there is no job directory, e.g. if the job host
        # selection command causes a submission failure - in that case, just
        # send the information to the suite log.
        LOG.debug(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
Example #32
0
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write(ctx_str + '\n')
    except IOError as exc:
        # This happens if there is no job directory, e.g. if the job host
        # selection command causes a submission failure - in that case, just
        # send the information to the suite log.
        LOG.debug(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
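The fallback in log_task_job_activity is worth noting: a failed append to the
per-job activity log degrades to the suite log rather than raising. A
self-contained sketch of that append-with-fallback, using the stdlib logger
in place of cylc's LOG:

    import logging

    logging.basicConfig(level=logging.DEBUG)
    LOG = logging.getLogger("suite")

    def append_activity(path, ctx_str):
        try:
            with open(path, "a") as handle:
                handle.write(ctx_str + "\n")
        except IOError as exc:
            # No job directory yet: send the information to the suite log.
            LOG.debug(exc)
            LOG.info(ctx_str)

    append_activity("/no/such/dir/job-activity.log",
                    "[jobs-submit ...] ret_code=1")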
Example #33
0
    def _process_message_submitted(self, itask, event_time):
        """Helper for process_message, handle a submit-succeeded message."""
        try:
            LOG.info(('job[%(submit_num)02d] submitted to'
                      ' %(host)s:%(batch_sys_name)s[%(submit_method_id)s]') %
                     itask.summary,
                     itask=itask)
        except KeyError:
            pass
        self.suite_db_mgr.put_update_task_jobs(
            itask, {
                "time_submit_exit": event_time,
                "submit_status": 0,
                "batch_sys_job_id": itask.summary.get('submit_method_id')
            })

        if itask.tdef.run_mode == 'simulation':
            # Simulate job execution at this point.
            itask.set_summary_time('submitted', event_time)
            itask.set_summary_time('started', event_time)
            itask.state.reset_state(TASK_STATUS_RUNNING)
            itask.state.outputs.set_completion(TASK_OUTPUT_STARTED, True)
            return

        itask.set_summary_time('submitted', event_time)
        # Unset started and finished times in case of resubmission.
        itask.set_summary_time('started')
        itask.set_summary_time('finished')
        itask.summary['latest_message'] = TASK_OUTPUT_SUBMITTED

        self.pflag = True
        if itask.state.status == TASK_STATUS_READY:
            # The job started message can (rarely) come in before the submit
            # command returns - in which case do not go back to 'submitted'.
            if itask.state.reset_state(TASK_STATUS_SUBMITTED):
                self.setup_event_handlers(itask, TASK_OUTPUT_SUBMITTED,
                                          'job submitted')
            self._reset_job_timers(itask)
Example #34
0
    def _check_access_priv_and_report(
            self, required_privilege_level, log_info=True):
        """Check access privilege and log requests with identifying info.

        In debug mode log all requests including task messages. Otherwise log
        all user commands, and just the first info command from each client.

        Return:
            dict: containing the client session

        """
        self._check_access_priv(required_privilege_level)
        command = inspect.currentframe().f_back.f_code.co_name
        auth_user, prog_name, user, host, uuid = _get_client_info()
        priv_level = self._get_priv_level(auth_user)
        LOG.debug(self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
            user, host, prog_name, priv_level, uuid))
        if cylc.flags.debug or (uuid not in self.clients and log_info):
            LOG.info(self.__class__.LOG_COMMAND_TMPL % (
                command, user, host, prog_name, uuid))
        self.clients.setdefault(uuid, {})
        self.clients[uuid]['time'] = time()
        self._housekeep()
        return self.clients[uuid]
Example #35
0
def main():
    """Manual test playground."""

    log = logging.getLogger(LOG)
    log.setLevel(logging.INFO)  # or logging.DEBUG
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    log.addHandler(handler)

    def print_result(result):
        """Print result"""
        if result['OUT']:
            LOG.info('result> ' + result['OUT'].strip())
        if result['ERR']:
            LOG.info('FAILED> ' + result['CMD'])
            LOG.info(result['ERR'].strip())

    pool = SuiteProcPool(3)

    for i in range(3):
        com = "sleep 5 && echo Hello from JOB " + str(i)
        pool.put_command(SuiteProcPool.JOBS_SUBMIT, com, print_result)
        com = "sleep 5 && echo Hello from POLL " + str(i)
        pool.put_command("poll", com, print_result)
        com = "sleep 5 && echo Hello from HANDLER " + str(i)
        pool.put_command("event-handler", com, print_result)
        com = "sleep 5 && echo Hello from HANDLER && badcommand"
        pool.put_command("event-handler", com, print_result)

    LOG.info('  sleeping')
    time.sleep(3)
    pool.handle_results_async()
    LOG.info('  sleeping')
    time.sleep(3)
    pool.close()
    # pool.terminate()
    pool.handle_results_async()
    LOG.info('  sleeping')
    time.sleep(3)
    pool.join()
    pool.handle_results_async()
Example #36
0
def main():
    """Manual test playground."""

    log = logging.getLogger(LOG)
    log.setLevel(logging.INFO)  # or logging.DEBUG
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    log.addHandler(handler)

    def print_result(result):
        """Print result"""
        if result['OUT']:
            LOG.info('result> ' + result['OUT'].strip())
        if result['ERR']:
            LOG.info('FAILED> ' + result['CMD'])
            LOG.info(result['ERR'].strip())

    pool = SuiteProcPool(3)

    for i in range(3):
        com = "sleep 5 && echo Hello from JOB " + str(i)
        pool.put_command(SuiteProcPool.JOBS_SUBMIT, com, print_result)
        com = "sleep 5 && echo Hello from POLL " + str(i)
        pool.put_command("poll", com, print_result)
        com = "sleep 5 && echo Hello from HANDLER " + str(i)
        pool.put_command("event-handler", com, print_result)
        com = "sleep 5 && echo Hello from HANDLER && badcommand"
        pool.put_command("event-handler", com, print_result)

    LOG.info('  sleeping')
    time.sleep(3)
    pool.handle_results_async()
    LOG.info('  sleeping')
    time.sleep(3)
    pool.close()
    # pool.terminate()
    pool.handle_results_async()
    LOG.info('  sleeping')
    time.sleep(3)
    pool.join()
    pool.handle_results_async()
Example #37
0
    def clear(self, point_strings=None, namespaces=None, cancel_settings=None):
        """Clear settings globally, or for listed namespaces and/or points.

        Return a tuple (modified_settings, bad_options), where:
        * modified_settings is similar to the return value of the "put" method,
          but for removed settings.
        * bad_options is a dict in the form:
              {"point_strings": ["20020202", ...], ...}
          The dict is only populated if there are options not associated with
          previous broadcasts. The keys can be:
          * point_strings: a list of bad point strings.
          * namespaces: a list of bad namespaces.
          * cancel: a list of tuples. Each tuple contains the keys of a bad
            setting.
        """

        if hasattr(cherrypy.request, "json"):
            point_strings = (
                cherrypy.request.json.get("point_strings", point_strings))
            namespaces = (
                cherrypy.request.json.get("namespaces", namespaces))
            cancel_settings = (
                cherrypy.request.json.get("cancel_settings", cancel_settings))
            point_strings = unicode_encode(point_strings)
            namespaces = unicode_encode(namespaces)
            cancel_settings = unicode_encode(cancel_settings)
        # If cancel_settings defined, only clear specific settings
        cancel_keys_list = self._settings_to_keys_list(cancel_settings)

        # Clear settings
        modified_settings = []
        with self.lock:
            for point_string, point_string_settings in self.settings.items():
                if point_strings and point_string not in point_strings:
                    continue
                for namespace, namespace_settings in (
                        point_string_settings.items()):
                    if namespaces and namespace not in namespaces:
                        continue
                    stuff_stack = [([], namespace_settings)]
                    while stuff_stack:
                        keys, stuff = stuff_stack.pop()
                        for key, value in stuff.items():
                            if isinstance(value, dict):
                                stuff_stack.append((keys + [key], value))
                            elif (not cancel_keys_list or
                                    keys + [key] in cancel_keys_list):
                                stuff[key] = None
                                setting = {key: value}
                                for rkey in reversed(keys):
                                    setting = {rkey: setting}
                                modified_settings.append(
                                    (point_string, namespace, setting))

        # Prune any empty branches
        bad_options = self._get_bad_options(
            self._prune(), point_strings, namespaces, cancel_keys_list)

        # Log the broadcast
        self._append_db_queue(modified_settings, is_cancel=True)
        LOG.info(
            get_broadcast_change_report(modified_settings, is_cancel=True))
        if bad_options:
            LOG.error(get_broadcast_bad_options_report(bad_options))

        return (modified_settings, bad_options)
Example #38
0
    def put(self, point_strings=None, namespaces=None, settings=None,
            not_from_client=False):
        """Add new broadcast settings (server side interface).

        Return a tuple (modified_settings, bad_options) where:
          modified_settings is a list of modified settings in the form:
            [("20200202", "foo", {"command scripting": "true"}), ...]
          bad_options is as described in the docstring for self.clear().
        """
        check_access_priv(self, 'full-control')
        self.report('broadcast_put')
        if not not_from_client:
            point_strings = (
                cherrypy.request.json.get("point_strings", point_strings))
            namespaces = (
                cherrypy.request.json.get("namespaces", namespaces))
            settings = (
                cherrypy.request.json.get("settings", settings))
            point_strings = unicode_encode(point_strings)
            namespaces = unicode_encode(namespaces)
            settings = unicode_encode(settings)

        modified_settings = []
        bad_point_strings = []
        bad_namespaces = []

        with self.lock:
            for setting in settings:
                for point_string in point_strings:
                    # Standardise the point and check its validity.
                    bad_point = False
                    try:
                        point_string = standardise_point_string(point_string)
                    except Exception:
                        if point_string != '*':
                            bad_point_strings.append(point_string)
                            bad_point = True
                    if not bad_point and point_string not in self.settings:
                        self.settings[point_string] = {}
                    for namespace in namespaces:
                        if namespace not in self.linearized_ancestors:
                            bad_namespaces.append(namespace)
                        elif not bad_point:
                            if namespace not in self.settings[point_string]:
                                self.settings[point_string][namespace] = {}
                            self._addict(
                                self.settings[point_string][namespace],
                                setting)
                            modified_settings.append(
                                (point_string, namespace, setting))

        # Log the broadcast
        self._append_db_queue(modified_settings)
        LOG.info(get_broadcast_change_report(modified_settings))

        bad_options = {}
        if bad_point_strings:
            bad_options["point_strings"] = bad_point_strings
        if bad_namespaces:
            bad_options["namespaces"] = bad_namespaces
        return modified_settings, bad_options
Example #39
0
    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for a host
        select command to complete, or that are waiting for remote
        initialisation. A bad host select command, an error writing to a job
        file, or bad remote initialisation will cause a bad task, leading to
        submission failure.

        This method uses prep_submit_task_job() as a helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)
        if not prepared_tasks:
            return bad_tasks

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.summary['latest_message'] = self.REMOTE_INIT_MSG
                continue
            # Persist
            if owner:
                owner_at_host = owner + '@' + host
            else:
                owner_at_host = host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info('submit-num=%d, owner@host=%s' %
                         (itask.submit_num, owner_at_host),
                         itask=itask)
                self.suite_db_mgr.put_insert_task_jobs(
                    itask, {
                        'is_manual_submit': itask.is_manual_submit,
                        'try_num': itask.get_try_num(),
                        'time_submit': now_str,
                        'user_at_host': owner_at_host,
                        'batch_sys_name': itask.summary['batch_sys_name'],
                    })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    self.task_events_mgr.log_task_job_activity(
                        SuiteProcContext(self.JOBS_SUBMIT,
                                         '(init %s)' % owner_at_host,
                                         err=REMOTE_INIT_FAILED,
                                         ret_code=1), suite, itask.point,
                        itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED,
                        self.poll_task_jobs)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if cylc.flags.debug:
                cmd.append('--debug')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [('host', host, is_remote_host),
                                          ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(
                GLOBAL_CFG.get_derived_host_item(suite,
                                                 'suite job log directory',
                                                 host, owner))
            stdin_file_paths = []
            job_log_dirs = []
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                if remote_mode:
                    stdin_file_paths.append(
                        self.task_events_mgr.get_task_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num, self.JOB_FILE_BASE))
                job_log_dirs.append(
                    self.task_events_mgr.get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file write
                # flag so that subsequent manual retrigger will generate a new
                # job file.
                itask.local_job_file_path = None
                itask.state.reset_state(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            cmd += job_log_dirs
            self.proc_pool.put_command(
                SuiteProcContext(self.JOBS_SUBMIT,
                                 cmd,
                                 stdin_file_paths=stdin_file_paths,
                                 job_log_dirs=job_log_dirs,
                                 **kwargs), self._submit_task_jobs_callback,
                [suite, itasks])
        return done_tasks
Example #40
0
    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for a host
        select command to complete, or that are waiting for remote
        initialisation. A bad host select command, an error writing to a job
        file, or bad remote initialisation will cause a bad task, leading to
        submission failure.

        This method uses prep_submit_task_job() as a helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.summary['latest_message'] = self.REMOTE_INIT_MSG
                continue
            # Persist
            if owner:
                owner_at_host = owner + '@' + host
            else:
                owner_at_host = host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info(
                    'submit-num=%d, owner@host=%s' % (
                        itask.submit_num, owner_at_host),
                    itask=itask)
                self.suite_db_mgr.put_insert_task_jobs(itask, {
                    'is_manual_submit': itask.is_manual_submit,
                    'try_num': itask.get_try_num(),
                    'time_submit': now_str,
                    'user_at_host': owner_at_host,
                    'batch_sys_name': itask.summary['batch_sys_name'],
                })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SuiteProcContext(
                            self.JOBS_SUBMIT,
                            '(init %s)' % owner_at_host,
                            err=REMOTE_INIT_FAILED,
                            ret_code=1),
                        suite, itask.point, itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if cylc.flags.debug:
                cmd.append('--debug')
            if cylc.flags.utc:
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [
                    ('host', host, is_remote_host),
                    ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, 'suite job log directory', host, owner))
            stdin_file_paths = []
            job_log_dirs = []
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                if remote_mode:
                    stdin_file_paths.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file write
                # flag so that subsequent manual retrigger will generate a new
                # job file.
                itask.local_job_file_path = None
                itask.state.reset_state(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            cmd += job_log_dirs
            self.proc_pool.put_command(
                SuiteProcContext(
                    self.JOBS_SUBMIT,
                    cmd,
                    stdin_file_paths=stdin_file_paths,
                    job_log_dirs=job_log_dirs,
                    **kwargs
                ),
                self._submit_task_jobs_callback, [suite, itasks])
        return done_tasks
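
A note on the command assembly above: "--host" and "--user" are only added
when they point somewhere genuinely remote, and "--remote-mode" is switched
on as soon as either does. Below is a minimal standalone sketch of that
selection logic. The is_remote_* stand-ins are simplified assumptions (the
real helpers also resolve host aliases and IP addresses), and the job log
root, which the code above derives from the global config, is taken here as
a plain argument.

import getpass
import socket


def _is_remote_host(host):
    # Simplified stand-in: treat None, "localhost" and this machine's
    # hostname as local.
    return host is not None and host not in (
        'localhost', socket.gethostname())


def _is_remote_user(owner):
    # Simplified stand-in: only a different account counts as remote.
    return owner is not None and owner != getpass.getuser()


def build_jobs_submit_cmd(host, owner, job_log_root, job_log_dirs,
                          debug=False, utc=False):
    """Sketch of the "cylc jobs-submit" command-line assembly."""
    cmd = ['cylc', 'jobs-submit']
    if debug:
        cmd.append('--debug')
    if utc:
        cmd.append('--utc-mode')
    remote_mode = False
    for key, value, test_func in [
            ('host', host, _is_remote_host),
            ('user', owner, _is_remote_user)]:
        if test_func(value):
            cmd.append('--%s=%s' % (key, value))
            remote_mode = True
    if remote_mode:
        cmd.append('--remote-mode')
    # "--" stops option parsing before the positional job log arguments.
    return cmd + ['--', job_log_root] + job_log_dirs


# e.g. build_jobs_submit_cmd('hpc1', None, '~/cylc-run/demo/log/job',
#                            ['20180101T00Z/model/01'])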
Example #41
 def _run_event_mail_callback(proc_ctx):
     """Callback the mail command for notification of a suite event."""
     if proc_ctx.ret_code:
         LOG.warning(str(proc_ctx))
     else:
         LOG.info(str(proc_ctx))
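
The callback above does nothing but route the finished command context to
the appropriate log level. A self-contained sketch of that routing, using a
hypothetical FakeProcContext in place of the real SuiteProcContext object:

import logging

logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger('suite-event-mail')


class FakeProcContext(object):
    """Hypothetical stand-in for SuiteProcContext (illustration only)."""

    def __init__(self, cmd, ret_code):
        self.cmd = cmd
        self.ret_code = ret_code

    def __str__(self):
        return '%s  # return-code=%d' % (' '.join(self.cmd), self.ret_code)


def run_event_mail_callback(proc_ctx):
    # Same routing as above: non-zero exit -> WARNING, zero -> INFO.
    if proc_ctx.ret_code:
        LOG.warning(str(proc_ctx))
    else:
        LOG.info(str(proc_ctx))


run_event_mail_callback(FakeProcContext(['mail', '-s', 'shutdown'], 0))
run_event_mail_callback(FakeProcContext(['mail', '-s', 'stalled'], 1))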
Example #42
    def init_host(self, reg, host, owner):
        """Initialise suite run dir on a user@host.

        Create SUITE_RUN_DIR/log/job/ if necessary.
        Install suite contact environment file.
        Install suite python modules.

        Raise RemoteJobHostInitError if initialisation cannot complete.

        """
        if host is None:
            host = 'localhost'
        if ((host, owner) in [('localhost', None), ('localhost', USER)] or
                (host, owner) in self.init_host_map or self.single_task_mode):
            return
        user_at_host = host
        if owner:
            user_at_host = owner + '@' + host

        r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory', host, owner)
        r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
            reg, 'suite job log directory', host, owner)
        r_suite_srv_dir = os.path.join(
            r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

        # Create a UUID file in the service directory.
        # If remote host has the file in its service directory, we can assume
        # that the remote host has a shared file system with the suite host.
        ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
        uuid_str = str(uuid4())
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
        try:
            open(uuid_fname, 'wb').close()
            proc = Popen(
                shlex.split(ssh_tmpl) + [
                    '-n', user_at_host,
                    'test', '-e', os.path.join(r_suite_srv_dir, uuid_str)],
                stdout=PIPE, stderr=PIPE)
            if proc.wait() == 0:
                # Initialised, but no need to tidy up
                self.init_host_map[(host, owner)] = False
                return
        finally:
            try:
                os.unlink(uuid_fname)
            except OSError:
                pass

        cmds = []
        # Command to create suite directory structure on remote host.
        cmds.append(shlex.split(ssh_tmpl) + [
            '-n', user_at_host,
            'mkdir', '-p',
            r_suite_run_dir, r_log_job_dir, r_suite_srv_dir])
        # Command to copy contact and authentication files to remote host.
        # Note: no need to do this if task communication method is "poll".
        should_unlink = GLOBAL_CFG.get_host_item(
            'task communication method', host, owner) != "poll"
        # Resolve scp_tmpl up front: it is also needed below for the
        # python library copy, even when contact files are not installed.
        scp_tmpl = GLOBAL_CFG.get_host_item('scp command', host, owner)
        if should_unlink:
            cmds.append(shlex.split(scp_tmpl) + [
                '-p',
                self.suite_srv_files_mgr.get_contact_file(reg),
                self.suite_srv_files_mgr.get_auth_item(
                    self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
                self.suite_srv_files_mgr.get_auth_item(
                    self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg),
                user_at_host + ':' + r_suite_srv_dir + '/'])
        # Command to copy python library to remote host.
        suite_run_py = os.path.join(
            GLOBAL_CFG.get_derived_host_item(reg, 'suite run directory'),
            'python')
        if os.path.isdir(suite_run_py):
            cmds.append(shlex.split(scp_tmpl) + [
                '-pr',
                suite_run_py, user_at_host + ':' + r_suite_run_dir + '/'])
        # Run commands in sequence.
        for cmd in cmds:
            proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
            out, err = proc.communicate()
            if proc.wait():
                raise RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_INIT,
                    user_at_host, ' '.join([quote(item) for item in cmd]),
                    proc.returncode, out, err)
        self.init_host_map[(host, owner)] = should_unlink
        LOG.info('Initialised %s:%s' % (user_at_host, r_suite_run_dir))
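
The most interesting part of init_host is the UUID handshake: a throwaway
file is created in the local service directory and probed for over SSH, so
finding it on the remote side proves the job host and the suite host share
a filesystem and no file installation is needed. A standalone sketch of
that probe, assuming a plain "ssh" template and caller-supplied
directories:

import os
import shlex
from subprocess import PIPE, Popen
from uuid import uuid4


def shares_filesystem(user_at_host, local_srv_dir, remote_srv_dir,
                      ssh_tmpl='ssh -oBatchMode=yes'):
    """Return True if user_at_host sees the same service directory."""
    uuid_str = str(uuid4())
    uuid_fname = os.path.join(local_srv_dir, uuid_str)
    try:
        # "touch" a uniquely named probe file locally ...
        open(uuid_fname, 'wb').close()
        # ... and test for it on the remote side.
        proc = Popen(
            shlex.split(ssh_tmpl) + [
                '-n', user_at_host,
                'test', '-e', os.path.join(remote_srv_dir, uuid_str)],
            stdout=PIPE, stderr=PIPE)
        return proc.wait() == 0
    finally:
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass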
Example #44
    def _prep_submit_task_job_impl(self, suite, itask):
        """Helper for self._prep_submit_task_job."""
        overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
            itask.identity)
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides)
        else:
            rtconfig = itask.tdef.rtconfig

        # Retry delays, needed for the try_num
        self._set_retry_timers(itask, rtconfig)

        # Submit number and try number
        LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
        itask.submit_num += 1
        itask.summary['submit_num'] = itask.submit_num
        itask.local_job_file_path = None
        self.suite_db_mgr.put_insert_task_jobs(itask, {
            "is_manual_submit": itask.is_manual_submit,
            "try_num": itask.get_try_num(),
            "time_submit": get_current_time_string(),
        })

        itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
        for name in rtconfig['extra log files']:
            itask.summary['logfiles'].append(expandvars(name))

        # Determine task host settings now, just before job submission,
        # because dynamic host selection may be used.

        # host may be None (= run task on suite host)
        itask.task_host = get_task_host(rtconfig['remote']['host'])
        if not itask.task_host:
            itask.task_host = 'localhost'
        elif itask.task_host != "localhost":
            LOG.info("[%s] -Task host: %s" % (
                itask.identity, itask.task_host))

        itask.task_owner = rtconfig['remote']['owner']

        if itask.task_owner:
            user_at_host = itask.task_owner + "@" + itask.task_host
        else:
            user_at_host = itask.task_host
        itask.summary['host'] = user_at_host
        itask.summary['job_hosts'][itask.submit_num] = user_at_host
        try:
            batch_sys_conf = self.task_events_mgr.get_host_conf(
                itask, 'batch systems')[rtconfig['job']['batch system']]
        except (TypeError, KeyError):
            batch_sys_conf = {}
        try:
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
                rtconfig['job']['execution time limit'])
        except TypeError:
            pass
        if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
            # Default = 1, 2 and 7 minute intervals, i.e. polls at roughly
            # 1, 3 and 10 minutes after the time limit is exceeded
            itask.poll_timers[self.KEY_EXECUTE_TIME_LIMIT] = (
                TaskActionTimer(delays=batch_sys_conf.get(
                    'execution time limit polling intervals', [60, 120, 420])))
        for label, key in [
                ('submission polling intervals', TASK_STATUS_SUBMITTED),
                ('execution polling intervals', TASK_STATUS_RUNNING)]:
            if key in itask.poll_timers:
                itask.poll_timers[key].reset()
            else:
                values = self.task_events_mgr.get_host_conf(
                    itask, label, skey='job')
                if values:
                    itask.poll_timers[key] = TaskActionTimer(delays=values)

        self.init_host(suite, itask.task_host, itask.task_owner)
        if itask.state.outputs.has_custom_triggers():
            self.suite_db_mgr.put_update_task_outputs(itask)
        self.suite_db_mgr.put_update_task_jobs(itask, {
            "user_at_host": user_at_host,
            "batch_sys_name": itask.summary['batch_sys_name'],
        })
        itask.is_manual_submit = False

        scripts = self._get_job_scripts(itask, rtconfig)

        # Location of job file, etc
        self._create_job_log_path(suite, itask)
        job_d = self.task_events_mgr.get_task_job_id(
            itask.point, itask.tdef.name, itask.submit_num)
        job_file_path = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                suite, "suite job log directory",
                itask.task_host, itask.task_owner),
            job_d, self.JOB_FILE_BASE)
        return {
            'batch_system_name': rtconfig['job']['batch system'],
            'batch_submit_command_template': (
                rtconfig['job']['batch submit command template']),
            'batch_system_conf': batch_sys_conf,
            'directives': rtconfig['directives'],
            'environment': rtconfig['environment'],
            'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
            'env-script': rtconfig['env-script'],
            'err-script': rtconfig['err-script'],
            'host': itask.task_host,
            'init-script': rtconfig['init-script'],
            'job_file_path': job_file_path,
            'job_d': job_d,
            'namespace_hierarchy': itask.tdef.namespace_hierarchy,
            'owner': itask.task_owner,
            'param_var': itask.tdef.param_var,
            'post-script': scripts[2],
            'pre-script': scripts[0],
            'remote_suite_d': rtconfig['remote']['suite definition directory'],
            'script': scripts[1],
            'shell': rtconfig['job']['shell'],
            'submit_num': itask.submit_num,
            'suite_name': suite,
            'task_id': itask.identity,
            'try_num': itask.get_try_num(),
            'work_d': rtconfig['work sub-directory'],
        }
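
Note how the broadcast overrides are applied to a deep copy of the task's
runtime config, so the shared task definition is never mutated. A minimal
sketch of that copy-then-override pattern, with a simplified stand-in for
poverride (the real cylc helper also handles lists and special keys):

import copy


def poverride_sketch(target, sparse):
    # Recursively merge the sparse override dict into target, in place.
    for key, val in sparse.items():
        if isinstance(val, dict) and isinstance(target.get(key), dict):
            poverride_sketch(target[key], val)
        else:
            target[key] = val


rtconfig = {'job': {'batch system': 'background'},
            'environment': {'COLOUR': 'blue'}}
overrides = {'environment': {'COLOUR': 'red'}}
if overrides:
    # Copy first, as above, so the shared rtconfig is left untouched.
    my_rtconfig = copy.deepcopy(rtconfig)
    poverride_sketch(my_rtconfig, overrides)
else:
    my_rtconfig = rtconfig
assert rtconfig['environment']['COLOUR'] == 'blue'
assert my_rtconfig['environment']['COLOUR'] == 'red'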
Example #45
    def init_host(self, reg, host, owner):
        """Initialise suite run dir on a user@host.

        Create SUITE_RUN_DIR/log/job/ if necessary.
        Install suite contact environment file.
        Install suite python modules.

        Raise RemoteJobHostInitError if initialisation cannot complete.

        """
        if host is None:
            host = 'localhost'
        if (self.single_task_mode or
                (host, owner) in self.init_host_map or
                not is_remote(host, owner)):
            return
        user_at_host = host
        if owner:
            user_at_host = owner + '@' + host

        r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory', host, owner)
        r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
            reg, 'suite job log directory', host, owner)
        r_suite_srv_dir = os.path.join(
            r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

        # Create a UUID file in the service directory.
        # If remote host has the file in its service directory, we can assume
        # that the remote host has a shared file system with the suite host.
        ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
        uuid_str = str(uuid4())
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
        try:
            open(uuid_fname, 'wb').close()
            proc = Popen(
                shlex.split(ssh_tmpl) + [
                    '-n', user_at_host,
                    'test', '-e', os.path.join(r_suite_srv_dir, uuid_str)],
                stdout=PIPE, stderr=PIPE)
            if proc.wait() == 0:
                # Initialised, but no need to tidy up
                self.init_host_map[(host, owner)] = False
                return
        finally:
            try:
                os.unlink(uuid_fname)
            except OSError:
                pass

        cmds = []
        # Command to create suite directory structure on remote host.
        cmds.append(shlex.split(ssh_tmpl) + [
            '-n', user_at_host,
            'mkdir', '-p',
            r_suite_run_dir, r_log_job_dir, r_suite_srv_dir])
        # Command to copy contact and authentication files to remote host.
        # Note: no need to do this if task communication method is "poll".
        should_unlink = GLOBAL_CFG.get_host_item(
            'task communication method', host, owner) != "poll"
        # Resolve scp_tmpl up front: it is also needed below for the
        # python library copy, even when contact files are not installed.
        scp_tmpl = GLOBAL_CFG.get_host_item('scp command', host, owner)
        if should_unlink:
            # Handle not having SSL certs installed.
            try:
                ssl_cert = self.suite_srv_files_mgr.get_auth_item(
                    self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg)
            except (SuiteServiceFileError, ValueError):
                ssl_cert = None
            cmds.append(shlex.split(scp_tmpl) + [
                '-p',
                self.suite_srv_files_mgr.get_contact_file(reg),
                self.suite_srv_files_mgr.get_auth_item(
                    self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
                user_at_host + ':' + r_suite_srv_dir + '/'])
            if ssl_cert is not None:
                cmds[-1].insert(-1, ssl_cert)
        # Command to copy python library to remote host.
        suite_run_py = os.path.join(
            GLOBAL_CFG.get_derived_host_item(reg, 'suite run directory'),
            'python')
        if os.path.isdir(suite_run_py):
            cmds.append(shlex.split(scp_tmpl) + [
                '-pr',
                suite_run_py, user_at_host + ':' + r_suite_run_dir + '/'])
        # Run commands in sequence.
        for cmd in cmds:
            proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
            out, err = proc.communicate()
            if proc.wait():
                raise RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_INIT,
                    user_at_host, ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err)
        self.init_host_map[(host, owner)] = should_unlink
        LOG.info('Initialised %s:%s' % (user_at_host, r_suite_run_dir))
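
Both init_host variants finish the same way: the setup commands run
strictly in sequence and the first non-zero exit aborts the lot, so a
half-initialised host is reported rather than silently used. A condensed
sketch of that runner; the exception type and message format here are
simplified assumptions:

from pipes import quote  # shlex.quote on Python 3
from subprocess import PIPE, Popen


def run_in_sequence(cmds):
    """Run each command; raise on the first non-zero exit status."""
    for cmd in cmds:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.returncode:
            raise RuntimeError('command failed (%d): %s\n%s' % (
                proc.returncode,
                ' '.join(quote(item) for item in cmd),
                err))


run_in_sequence([['true'], ['echo', 'hello world']])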
Example #46
    def _prep_submit_task_job_impl(self, suite, itask):
        """Helper for self._prep_submit_task_job."""
        overrides = BroadcastServer.get_inst().get(itask.identity)
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides)
        else:
            rtconfig = itask.tdef.rtconfig

        # Retry delays, needed for the try_num
        self._set_retry_timers(itask, rtconfig)

        # Submit number and try number
        LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
        itask.submit_num += 1
        itask.summary['submit_num'] = itask.submit_num
        itask.local_job_file_path = None
        self.suite_db_mgr.put_insert_task_jobs(itask, {
            "is_manual_submit": itask.is_manual_submit,
            "try_num": itask.get_try_num(),
            "time_submit": get_current_time_string(),
        })

        itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
        for name in rtconfig['extra log files']:
            itask.summary['logfiles'].append(expandvars(name))

        # Determine task host settings now, just before job submission,
        # because dynamic host selection may be used.

        # host may be None (= run task on suite host)
        itask.task_host = get_task_host(rtconfig['remote']['host'])
        if not itask.task_host:
            itask.task_host = 'localhost'
        elif itask.task_host != "localhost":
            LOG.info("[%s] -Task host: %s" % (
                itask.identity, itask.task_host))

        itask.task_owner = rtconfig['remote']['owner']

        if itask.task_owner:
            user_at_host = itask.task_owner + "@" + itask.task_host
        else:
            user_at_host = itask.task_host
        itask.summary['host'] = user_at_host
        itask.summary['job_hosts'][itask.submit_num] = user_at_host
        try:
            batch_sys_conf = self.task_events_mgr.get_host_conf(
                itask, 'batch systems')[rtconfig['job']['batch system']]
        except (TypeError, KeyError):
            batch_sys_conf = {}
        try:
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
                rtconfig['job']['execution time limit'])
        except TypeError:
            pass
        if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
            # Default = 1, 2 and 7 minute intervals, i.e. polls at roughly
            # 1, 3 and 10 minutes after the time limit is exceeded
            itask.poll_timers[self.KEY_EXECUTE_TIME_LIMIT] = (
                TaskActionTimer(delays=batch_sys_conf.get(
                    'execution time limit polling intervals', [60, 120, 420])))
        for label, key in [
                ('submission polling intervals', TASK_STATUS_SUBMITTED),
                ('execution polling intervals', TASK_STATUS_RUNNING)]:
            if key in itask.poll_timers:
                itask.poll_timers[key].reset()
            else:
                values = self.task_events_mgr.get_host_conf(
                    itask, label, skey='job')
                if values:
                    itask.poll_timers[key] = TaskActionTimer(delays=values)

        self.init_host(suite, itask.task_host, itask.task_owner)
        self.suite_db_mgr.put_update_task_jobs(itask, {
            "user_at_host": user_at_host,
            "batch_sys_name": itask.summary['batch_sys_name'],
        })
        itask.is_manual_submit = False

        scripts = self._get_job_scripts(itask, rtconfig)

        # Location of job file, etc
        self._create_job_log_path(suite, itask)
        job_d = self.task_events_mgr.get_task_job_id(
            itask.point, itask.tdef.name, itask.submit_num)
        job_file_path = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                suite, "suite job log directory",
                itask.task_host, itask.task_owner),
            job_d, self.JOB_FILE_BASE)
        return {
            'batch_system_name': rtconfig['job']['batch system'],
            'batch_submit_command_template': (
                rtconfig['job']['batch submit command template']),
            'batch_system_conf': batch_sys_conf,
            'directives': rtconfig['directives'],
            'environment': rtconfig['environment'],
            'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
            'env-script': rtconfig['env-script'],
            'err-script': rtconfig['err-script'],
            'host': itask.task_host,
            'init-script': rtconfig['init-script'],
            'job_file_path': job_file_path,
            'job_d': job_d,
            'namespace_hierarchy': itask.tdef.namespace_hierarchy,
            'owner': itask.task_owner,
            'param_var': itask.tdef.param_var,
            'post-script': scripts[2],
            'pre-script': scripts[0],
            'remote_suite_d': rtconfig['remote']['suite definition directory'],
            'script': scripts[1],
            'shell': rtconfig['job']['shell'],
            'submit_num': itask.submit_num,
            'suite_name': suite,
            'task_id': itask.identity,
            'try_num': itask.get_try_num(),
            'work_d': rtconfig['work sub-directory'],
        }
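
The "execution time limit polling intervals" default of [60, 120, 420]
seconds reads oddly until you remember that each delay is measured from the
previous poll, not from the time limit: cumulatively the polls land at
roughly 1, 3 and 10 minutes past the limit, as the comment above says. A
quick worked sketch of that accumulation:

def poll_times_after_limit(delays=(60, 120, 420)):
    """Cumulative poll offsets, in seconds, after the time limit."""
    total, offsets = 0, []
    for delay in delays:
        total += delay
        offsets.append(total)
    return offsets


# [60, 180, 600] seconds, i.e. about 1, 3 and 10 minutes past the limit.
print(poll_times_after_limit())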
Example #47
 def _reset_job_timers(self, itask):
     """Set up poll timer and timeout for task."""
     if itask.state.status not in TASK_STATUSES_ACTIVE:
         # Reset, task not active
         itask.timeout = None
         itask.poll_timer = None
         return
     ctx = (itask.submit_num, itask.state.status)
     if itask.poll_timer and itask.poll_timer.ctx == ctx:
         return
     # Set poll timer
     # Set timeout
     timeref = None  # reference time, submitted or started time
     timeout = None  # configured timeout value
     if itask.state.status == TASK_STATUS_RUNNING:
         timeref = itask.summary['started_time']
         timeout_key = 'execution timeout'
         timeout = self._get_events_conf(itask, timeout_key)
         delays = self.get_host_conf(
             itask, 'execution polling intervals', skey='job',
             default=[900])  # Default 15 minute intervals
         if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
             time_limit = itask.summary[self.KEY_EXECUTE_TIME_LIMIT]
             try:
                 host_conf = self.get_host_conf(itask, 'batch systems')
                 batch_sys_conf = host_conf[itask.summary['batch_sys_name']]
             except (TypeError, KeyError):
                 batch_sys_conf = {}
             time_limit_delays = batch_sys_conf.get(
                 'execution time limit polling intervals', [60, 120, 420])
             timeout = time_limit + sum(time_limit_delays)
             # Remove excessive polling before the time limit
             while sum(delays) > time_limit:
                 del delays[-1]
             # But fill up the gap before time limit
             if delays:
                 size = int((time_limit - sum(delays)) / delays[-1])
                 delays.extend([delays[-1]] * size)
             time_limit_delays[0] += time_limit - sum(delays)
             delays += time_limit_delays
     else:  # if itask.state.status == TASK_STATUS_SUBMITTED:
         timeref = itask.summary['submitted_time']
         timeout_key = 'submission timeout'
         timeout = self._get_events_conf(itask, timeout_key)
         delays = self.get_host_conf(
             itask, 'submission polling intervals', skey='job',
             default=[900])  # Default 15 minute intervals
     try:
         itask.timeout = timeref + float(timeout)
         timeout_str = intvl_as_str(timeout)
     except (TypeError, ValueError):
         itask.timeout = None
         timeout_str = None
     itask.poll_timer = TaskActionTimer(ctx=ctx, delays=delays)
     # Log timeout and polling schedule
     message = 'health check settings: %s=%s' % (timeout_key, timeout_str)
     # Attempt to group identical consecutive delays as N*DELAY,...
     if itask.poll_timer.delays:
         items = []  # [(number of item - 1, item), ...]
         for delay in itask.poll_timer.delays:
             if items and items[-1][1] == delay:
                 items[-1][0] += 1
             else:
                 items.append([0, delay])
         message += ', polling intervals='
         for num, item in items:
             if num:
                 message += '%d*' % (num + 1)
             message += '%s,' % intvl_as_str(item)
         message += '...'
     LOG.info(message, itask=itask)
     # Set next poll time
     self.check_poll_time(itask)
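
The polling-schedule log line compresses runs of identical delays into an
N*DELAY form so that long schedules stay readable. The same grouping in
isolation, with a hypothetical plain-seconds formatter standing in for
intvl_as_str:

def summarise_delays(delays):
    """Group identical consecutive delays: [30, 30, 60] -> '2*30s,60s,...'."""
    items = []  # [[count - 1, delay], ...]
    for delay in delays:
        if items and items[-1][1] == delay:
            items[-1][0] += 1
        else:
            items.append([0, delay])
    message = ''
    for num, item in items:
        if num:
            message += '%d*' % (num + 1)
        message += '%ds,' % item  # stand-in for intvl_as_str(item)
    return message + '...'


print(summarise_delays([30, 30, 30, 60, 900]))  # 3*30s,60s,900s,...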
Example #49
    def init_suite_run_dir(self, reg, user_at_host):
        """Initialise suite run dir on a user@host.

        Create SUITE_RUN_DIR/log/job/ if necessary.
        Install suite contact environment file.
        Install suite python modules.

        Raise RemoteJobHostInitError if initialisation cannot complete.

        """
        if "@" in user_at_host:
            owner, host = user_at_host.split("@", 1)
        else:
            owner, host = None, user_at_host
        if (
            (owner, host) in [(None, "localhost"), (USER, "localhost")]
            or user_at_host in self.initialised_hosts
            or self.single_task_mode
        ):
            return

        r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(reg, "suite run directory", host, owner)
        r_log_job_dir = GLOBAL_CFG.get_derived_host_item(reg, "suite job log directory", host, owner)
        r_suite_srv_dir = os.path.join(r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

        # Create a UUID file in the service directory.
        # If remote host has the file in its service directory, we can assume
        # that the remote host has a shared file system with the suite host.
        ssh_tmpl = GLOBAL_CFG.get_host_item("remote shell template", host, owner)
        uuid_str = str(uuid4())
        uuid_fname = os.path.join(self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
        try:
            open(uuid_fname, "wb").close()
            proc = Popen(
                shlex.split(ssh_tmpl) + ["-n", user_at_host, "test", "-e", os.path.join(r_suite_srv_dir, uuid_str)],
                stdout=PIPE,
                stderr=PIPE,
            )
            if proc.wait() == 0:
                # Initialised, but no need to tidy up
                self.initialised_hosts[user_at_host] = False
                return
        finally:
            try:
                os.unlink(uuid_fname)
            except OSError:
                pass

        cmds = []
        # Command to create suite directory structure on remote host.
        cmds.append(
            shlex.split(ssh_tmpl) + ["-n", user_at_host, "mkdir", "-p", r_suite_run_dir, r_log_job_dir, r_suite_srv_dir]
        )
        # Command to copy contact and authentication files to remote host.
        # Note: no need to do this if task communication method is "poll".
        should_unlink = GLOBAL_CFG.get_host_item("task communication method", host, owner) != "poll"
        # Resolve scp_tmpl up front: it is also needed below for the
        # python library copy, even when contact files are not installed.
        scp_tmpl = GLOBAL_CFG.get_host_item("remote copy template", host, owner)
        if should_unlink:
            cmds.append(
                shlex.split(scp_tmpl)
                + [
                    "-p",
                    self.suite_srv_files_mgr.get_contact_file(reg),
                    self.suite_srv_files_mgr.get_auth_item(self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
                    self.suite_srv_files_mgr.get_auth_item(self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg),
                    user_at_host + ":" + r_suite_srv_dir + "/",
                ]
            )
        # Command to copy python library to remote host.
        suite_run_py = os.path.join(GLOBAL_CFG.get_derived_host_item(reg, "suite run directory"), "python")
        if os.path.isdir(suite_run_py):
            cmds.append(shlex.split(scp_tmpl) + ["-pr", suite_run_py, user_at_host + ":" + r_suite_run_dir + "/"])
        # Run commands in sequence.
        for cmd in cmds:
            proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
            out, err = proc.communicate()
            if proc.wait():
                raise RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_INIT,
                    user_at_host,
                    " ".join([quote(item) for item in cmd]),
                    proc.returncode,
                    out,
                    err,
                )
        self.initialised_hosts[user_at_host] = should_unlink
        LOG.info("Initialised %s:%s" % (user_at_host, r_suite_run_dir))