Example #1
    def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
        """Poll jobs of specified tasks.

        Any job that is or was submitted or running can be polled, except
        for retrying tasks, which would (correctly) poll as failed.
        Succeeded tasks are not polled by default.

        This method uses _poll_task_jobs_callback() and
        _manip_task_jobs_callback() as helper/callback methods.

        _poll_task_job_callback() executes one specific job.
        """
        to_poll_tasks = []
        pollable_statuses = set([
            TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED])
        if poll_succ:
            pollable_statuses.add(TASK_STATUS_SUCCEEDED)
        for itask in itasks:
            if itask.state.status in pollable_statuses:
                to_poll_tasks.append(itask)
            else:
                LOG.debug("skipping %s: not pollable, "
                          "or skipping 'succeeded' tasks" % itask.identity)
        if to_poll_tasks:
            if msg is not None:
                LOG.info(msg)
            self._run_job_cmd(
                self.JOBS_POLL, suite, to_poll_tasks,
                self._poll_task_jobs_callback)
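The pattern here is generic: build the set of acceptable states, filter, and only issue the batch command if anything survives. A minimal standalone sketch of that filter step, with plain dicts standing in for Cylc's task objects (names invented for illustration):

import logging

LOG = logging.getLogger("sketch")
POLLABLE = {"submitted", "running", "failed"}

def select_pollable(tasks, poll_succ=True):
    statuses = set(POLLABLE)
    if poll_succ:
        statuses.add("succeeded")
    to_poll = [t for t in tasks if t["status"] in statuses]
    for t in tasks:
        if t["status"] not in statuses:
            LOG.debug("skipping %s: not pollable", t["id"])
    return to_poll

print(select_pollable([{"id": "foo.1", "status": "running"},
                       {"id": "bar.1", "status": "waiting"}]))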
Example #2
    def _upgrade_with_state_file_extras(self):
        """Upgrade the database tables after reading in state file."""
        conn = self.connect()

        # Rename old tables
        for t_name in [self.TABLE_TASK_STATES, self.TABLE_TASK_EVENTS]:
            conn.execute(r"ALTER TABLE " + t_name + r" RENAME TO " + t_name +
                         "_old")
        conn.commit()

        # Create tables with new columns
        self.create_tables()

        # Populate new tables using old column data
        for t_name in [self.TABLE_TASK_STATES, self.TABLE_TASK_EVENTS]:
            LOG.info(r"Upgrading %s table", t_name)
            column_names = [col.name for col in self.tables[t_name].columns]
            for i, row in enumerate(
                    conn.execute(r"SELECT " + ",".join(column_names) +
                                 " FROM " + t_name + "_old")):
                # These tables can be big, so we don't want to queue the items
                # in memory.
                conn.execute(self.tables[t_name].get_insert_stmt(), list(row))
        conn.commit()

        # Drop old tables
        for t_name in [self.TABLE_TASK_STATES, self.TABLE_TASK_EVENTS]:
            conn.execute(r"DROP TABLE " + t_name + "_old")
        conn.commit()
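The upgrade follows the usual SQLite migration recipe: rename the old table, re-create the new schema, copy rows across, then drop the renamed table. A self-contained sketch of the same recipe using the standard library's sqlite3 and a made-up table (not the real Cylc schema):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE task_states (name TEXT, status TEXT)")
conn.execute("INSERT INTO task_states VALUES ('foo', 'running')")
conn.execute("ALTER TABLE task_states RENAME TO task_states_old")
# The new schema gains a column; old rows are copied column-by-column.
conn.execute("CREATE TABLE task_states (name TEXT, status TEXT, flow TEXT)")
for row in conn.execute("SELECT name, status FROM task_states_old"):
    conn.execute("INSERT INTO task_states (name, status) VALUES (?, ?)", row)
conn.execute("DROP TABLE task_states_old")
conn.commit()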
Example #3
 def _process_message_failed(self, itask, event_time, message):
     """Helper for process_message, handle a failed message."""
     if event_time is None:
         event_time = get_current_time_string()
     itask.set_summary_time('finished', event_time)
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "run_status": 1,
         "time_run_exit": event_time,
     })
     if (TASK_STATUS_RETRYING not in itask.try_timers
             or itask.try_timers[TASK_STATUS_RETRYING].next() is None):
         # No retry lined up: definitive failure.
         self.pflag = True
         if itask.state.reset_state(TASK_STATUS_FAILED):
             self.setup_event_handlers(itask, "failed", message)
         LOG.critical("[%s] -job(%02d) %s", itask, itask.submit_num,
                      "failed")
     else:
         # There is a retry lined up
         delay_msg = "retrying in %s" % (
             itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
         msg = "failed, %s" % (delay_msg)
         LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
         itask.set_summary_message(msg)
         if itask.state.reset_state(TASK_STATUS_RETRYING):
             self.setup_event_handlers(
                 itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
     self._reset_job_timers(itask)
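The control flow hinges on try_timers: if the relevant timer is missing or its next() returns None, the failure is final; otherwise a retry is scheduled and logged. A toy timer (a stand-in, not Cylc's real TaskActionTimer) makes the branch explicit:

import logging

LOG = logging.getLogger("sketch")

class RetryTimer:
    """Toy stand-in for a task retry timer with a fixed delay list."""
    def __init__(self, delays):
        self.delays = list(delays)
    def next(self):
        # Return the next delay, or None when retries are exhausted.
        return self.delays.pop(0) if self.delays else None

timer = RetryTimer([30, 60])
delay = timer.next()
if delay is None:
    LOG.critical("failed")            # definitive failure
else:
    LOG.info("failed, retrying in %ss", delay)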
Example #4
 def _process_message_submit_failed(self, itask, event_time):
     """Helper for process_message, handle a submit-failed message."""
     LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
     if event_time is None:
         event_time = get_current_time_string()
     self.suite_db_mgr.put_update_task_jobs(
         itask, {
             "time_submit_exit": get_current_time_string(),
             "submit_status": 1,
         })
     itask.summary['submit_method_id'] = None
     if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
             itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
         # No submission retry lined up: definitive failure.
         self.pflag = True
         # See github #476.
         if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
             self.setup_event_handlers(itask, self.EVENT_SUBMIT_FAILED,
                                       'job %s' % self.EVENT_SUBMIT_FAILED)
     else:
         # There is a submission retry lined up.
         timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
         delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
         msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
         LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
         itask.set_summary_message(msg)
         if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
             self.setup_event_handlers(
                 itask, self.EVENT_SUBMIT_RETRY,
                 "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
     self._reset_job_timers(itask)
Example #5
 def _process_message_failed(self, itask, event_time, message):
     """Helper for process_message, handle a failed message."""
     if event_time is None:
         event_time = get_current_time_string()
     itask.set_summary_time('finished', event_time)
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "run_status": 1,
         "time_run_exit": event_time,
     })
     if (TASK_STATUS_RETRYING not in itask.try_timers or
             itask.try_timers[TASK_STATUS_RETRYING].next() is None):
         # No retry lined up: definitive failure.
         self.pflag = True
         if itask.state.reset_state(TASK_STATUS_FAILED):
             self.setup_event_handlers(itask, "failed", message)
         LOG.critical(
             "[%s] -job(%02d) %s", itask, itask.submit_num, "failed")
     else:
         # There is a retry lined up
         delay_msg = "retrying in %s" % (
             itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
         msg = "failed, %s" % (delay_msg)
         LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
         itask.set_summary_message(msg)
         if itask.state.reset_state(TASK_STATUS_RETRYING):
             self.setup_event_handlers(
                 itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
     self._reset_job_timers(itask)
Example #6
 def _process_message_submit_failed(self, itask, event_time):
     """Helper for process_message, handle a submit-failed message."""
     LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
     if event_time is None:
         event_time = get_current_time_string()
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "time_submit_exit": event_time,
         "submit_status": 1,
     })
     itask.summary['submit_method_id'] = None
     self.pflag = True
     if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
             itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
         # No submission retry lined up: definitive failure.
         # See github #476.
         if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
             self.setup_event_handlers(
                 itask, self.EVENT_SUBMIT_FAILED,
                 'job %s' % self.EVENT_SUBMIT_FAILED)
     else:
         # There is a submission retry lined up.
         timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
         delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
         msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
         LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
         itask.set_summary_message(msg)
         if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
             self.setup_event_handlers(
                 itask, self.EVENT_SUBMIT_RETRY,
                 "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
     self._reset_job_timers(itask)
Example #7
 def _process_message_succeeded(self, itask, event_time):
     """Helper for process_message, handle a succeeded message."""
     self.pflag = True
     itask.set_summary_time('finished', event_time)
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "run_status": 0,
         "time_run_exit": event_time,
     })
     # Update mean elapsed time only on task succeeded.
     if itask.summary['started_time'] is not None:
         itask.tdef.elapsed_times.append(
             itask.summary['finished_time'] -
             itask.summary['started_time'])
     if not itask.state.outputs.all_completed():
         msg = ""
         for output in itask.state.outputs.get_not_completed():
             if output not in [TASK_OUTPUT_EXPIRED,
                               TASK_OUTPUT_SUBMIT_FAILED,
                               TASK_OUTPUT_FAILED]:
                 msg += "\n  " + output
         if msg:
             LOG.info(
                 "[%s] -Succeeded with outputs not completed: %s",
                 itask, msg)
     if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
         self.setup_event_handlers(itask, "succeeded", "job succeeded")
     self._reset_job_timers(itask)
Example #9
    def _check_access_priv_and_report(self,
                                      required_privilege_level,
                                      log_info=True):
        """Check access privilege and log requests with identifying info.

        In debug mode log all requests including task messages. Otherwise log
        all user commands, and just the first info command from each client.

        Return:
            dict: containing the client session

        """
        self._check_access_priv(required_privilege_level)
        command = inspect.currentframe().f_back.f_code.co_name
        auth_user, prog_name, user, host, uuid = _get_client_info()
        priv_level = self._get_priv_level(auth_user)
        LOG.debug(self.LOG_CONNECT_ALLOWED_TMPL, user, host, prog_name,
                  priv_level, uuid)
        if cylc.flags.debug or (uuid not in self.clients and log_info):
            LOG.info(self.LOG_COMMAND_TMPL, command, user, host, prog_name,
                     uuid)
        self.clients.setdefault(uuid, {})
        self.clients[uuid]['time'] = time()
        self._housekeep()
        return self.clients[uuid]
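The parentheses added above only make explicit what Python already does: "and" binds tighter than "or", so the unparenthesised original parsed the same way. A two-line check of the precedence rule:

print(True or False and False)    # True  -> parsed as True or (False and False)
print((True or False) and False)  # False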
Example #10
 def load_db_broadcast_states(self, row_idx, row):
     """Load broadcast variables from runtime DB broadcast states row."""
     if row_idx == 0:
         LOG.info("LOADING broadcast states")
     point, namespace, key, value = row
     sections = []
     cur_key = key
     if "]" in cur_key:
         sections = self.REC_SECTION.findall(cur_key)
         cur_key = cur_key.rsplit(r"]", 1)[-1]
     with self.lock:
         self.broadcasts.setdefault(point, {})
         self.broadcasts[point].setdefault(namespace, {})
         dict_ = self.broadcasts[point][namespace]
         for section in sections:
             dict_.setdefault(section, {})
             dict_ = dict_[section]
         dict_[cur_key] = value
     LOG.info(
         CHANGE_FMT.strip() % {
             "change": CHANGE_PREFIX_SET,
             "point": point,
             "namespace": namespace,
             "key": key,
             "value": value
         })
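The setdefault chain builds nested section dicts on demand before assigning the leaf value. The same walk, reduced to a standalone helper:

def set_nested(d, keys, value):
    # Descend through (creating) each section, then set the final key.
    for key in keys[:-1]:
        d = d.setdefault(key, {})
    d[keys[-1]] = value

broadcasts = {}
set_nested(broadcasts, ["2020", "foo", "environment", "X"], "1")
print(broadcasts)  # {'2020': {'foo': {'environment': {'X': '1'}}}}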
Example #11
 def _upgrade_with_state_file_header(self, line):
     """Parse a header line in state file, add information to DB."""
     head, tail = line.split(" : ", 1)
     if head == "time":
         self.add_insert_item(
             self.TABLE_CHECKPOINT_ID, {
                 "id": self.CHECKPOINT_LATEST_ID,
                 "time": tail.split(" ", 1)[0],
                 "event": self.CHECKPOINT_LATEST_EVENT
             })
         return
     for name, key in [("run mode", "run_mode"),
                       ("initial cycle", "initial_point"),
                       ("final cycle", "final_point")]:
         if tail == "None":
             tail = None
         if head == name:
             self.add_insert_item(self.TABLE_SUITE_PARAMS, {
                 "key": key,
                 "value": tail
             })
             LOG.info(" + %s=%s", key, tail)
             if name == "final cycle":
                 return "broadcast"
             else:
                 return
Example #12
 def _process_message_succeeded(self, itask, event_time):
     """Helper for process_message, handle a succeeded message."""
     self.pflag = True
     itask.set_summary_time('finished', event_time)
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "run_status": 0,
         "time_run_exit": event_time,
     })
     # Update mean elapsed time only on task succeeded.
     if itask.summary['started_time'] is not None:
         itask.tdef.elapsed_times.append(itask.summary['finished_time'] -
                                         itask.summary['started_time'])
     if not itask.state.outputs.all_completed():
         msg = ""
         for output in itask.state.outputs.get_not_completed():
             if output not in [
                     TASK_OUTPUT_EXPIRED, TASK_OUTPUT_SUBMIT_FAILED,
                     TASK_OUTPUT_FAILED
             ]:
                 msg += "\n  " + output
         if msg:
             LOG.info("[%s] -Succeeded with outputs not completed: %s",
                      itask, msg)
     if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
         self.setup_event_handlers(itask, "succeeded", "job succeeded")
     self._reset_job_timers(itask)
Example #13
    def clear_broadcast(self,
                        point_strings=None,
                        namespaces=None,
                        cancel_settings=None):
        """Clear broadcasts globally, or for listed namespaces and/or points.

        Return a tuple (modified_settings, bad_options), where:
        * modified_settings is similar to the return value of the "put" method,
          but for removed broadcasts.
        * bad_options is a dict in the form:
              {"point_strings": ["20020202", ...], ...}
          The dict is only populated if there are options not associated with
          previous broadcasts. The keys can be:
          * point_strings: a list of bad point strings.
          * namespaces: a list of bad namespaces.
          * cancel: a list of tuples. Each tuple contains the keys of a bad
            setting.
        """
        # If cancel_settings defined, only clear specific broadcasts
        cancel_keys_list = self._settings_to_keys_list(cancel_settings)

        # Clear broadcasts
        modified_settings = []
        with self.lock:
            for point_string, point_string_settings in self.broadcasts.items():
                if point_strings and point_string not in point_strings:
                    continue
                for namespace, namespace_settings in (
                        point_string_settings.items()):
                    if namespaces and namespace not in namespaces:
                        continue
                    stuff_stack = [([], namespace_settings)]
                    while stuff_stack:
                        keys, stuff = stuff_stack.pop()
                        for key, value in stuff.items():
                            if isinstance(value, dict):
                                stuff_stack.append((keys + [key], value))
                            elif (not cancel_keys_list
                                  or keys + [key] in cancel_keys_list):
                                stuff[key] = None
                                setting = {key: value}
                                for rkey in reversed(keys):
                                    setting = {rkey: setting}
                                modified_settings.append(
                                    (point_string, namespace, setting))

        # Prune any empty branches
        bad_options = self._get_bad_options(self._prune(), point_strings,
                                            namespaces, cancel_keys_list)

        # Log the broadcast
        self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True)
        LOG.info(get_broadcast_change_report(modified_settings,
                                             is_cancel=True))
        if bad_options:
            LOG.error(get_broadcast_bad_options_report(bad_options))

        return (modified_settings, bad_options)
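The stuff_stack loop is an iterative depth-first walk over a nested settings dict, tracking the key path down to each leaf. Extracted as a standalone generator:

def walk(settings):
    # Yield (path, value) for every leaf in a nested dict, iteratively.
    stack = [([], settings)]
    while stack:
        keys, node = stack.pop()
        for key, value in node.items():
            if isinstance(value, dict):
                stack.append((keys + [key], value))
            else:
                yield keys + [key], value

for path, value in walk({"environment": {"X": "1"}, "script": "true"}):
    print(path, value)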
Example #14
 def upgrade_from_611(self):
     """Upgrade database on restart with a 6.11.X private database."""
     conn = self.connect()
      # Add hold_swap column to task_pool(_checkpoints) tables
     for t_name in [self.TABLE_TASK_POOL, self.TABLE_TASK_POOL_CHECKPOINTS]:
         LOG.info("Add hold_swap column to %s", t_name)
         conn.execute(r"ALTER TABLE " + t_name +
                      r" ADD COLUMN hold_swap TEXT")
     conn.commit()
Example #15
    def clear_broadcast(
            self, point_strings=None, namespaces=None, cancel_settings=None):
        """Clear broadcasts globally, or for listed namespaces and/or points.

        Return a tuple (modified_settings, bad_options), where:
        * modified_settings is similar to the return value of the "put" method,
          but for removed broadcasts.
        * bad_options is a dict in the form:
              {"point_strings": ["20020202", ...], ...}
          The dict is only populated if there are options not associated with
          previous broadcasts. The keys can be:
          * point_strings: a list of bad point strings.
          * namespaces: a list of bad namespaces.
          * cancel: a list of tuples. Each tuple contains the keys of a bad
            setting.
        """
        # If cancel_settings defined, only clear specific broadcasts
        cancel_keys_list = self._settings_to_keys_list(cancel_settings)

        # Clear broadcasts
        modified_settings = []
        with self.lock:
            for point_string, point_string_settings in self.broadcasts.items():
                if point_strings and point_string not in point_strings:
                    continue
                for namespace, namespace_settings in (
                        point_string_settings.items()):
                    if namespaces and namespace not in namespaces:
                        continue
                    stuff_stack = [([], namespace_settings)]
                    while stuff_stack:
                        keys, stuff = stuff_stack.pop()
                        for key, value in stuff.items():
                            if isinstance(value, dict):
                                stuff_stack.append((keys + [key], value))
                            elif (not cancel_keys_list or
                                    keys + [key] in cancel_keys_list):
                                stuff[key] = None
                                setting = {key: value}
                                for rkey in reversed(keys):
                                    setting = {rkey: setting}
                                modified_settings.append(
                                    (point_string, namespace, setting))

        # Prune any empty branches
        bad_options = self._get_bad_options(
            self._prune(), point_strings, namespaces, cancel_keys_list)

        # Log the broadcast
        self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True)
        LOG.info(
            get_broadcast_change_report(modified_settings, is_cancel=True))
        if bad_options:
            LOG.error(get_broadcast_bad_options_report(bad_options))

        return (modified_settings, bad_options)
Example #16
 def _run_event_handlers_callback(proc_ctx, abort_on_error=False):
     """Callback on completion of a suite event handler."""
     if proc_ctx.ret_code:
         msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
         LOG.error(str(proc_ctx))
         LOG.error(msg)
         if abort_on_error:
             raise SuiteEventError(msg)
     else:
         LOG.info(str(proc_ctx))
Example #17
 def satisfy_xclock(self, itask):
     """Attempt to satisfy itask's clock trigger, if it has one."""
     label, sig, ctx, satisfied = self._get_xclock(itask)
     if satisfied:
         return
     if wall_clock(*ctx.func_args, **ctx.func_kwargs):
         satisfied = True
         itask.state.xclock = (label, True)
         self.sat_xclock.append(sig)
      LOG.info('clock xtrigger satisfied: %s = %s', label, ctx)
Example #19
        def _authorise(self, *args, user='******', meta=None, **kwargs):
            if not meta:
                meta = {}
            host = meta.get('host', '?')
            prog = meta.get('prog', '?')

            usr_priv_level = self.get_priv_level(user)
            if usr_priv_level < req_priv_level:
                LOG.warn(
                    "[client-connect] DENIED (privilege '%s' < '%s') %s@%s:%s",
                    usr_priv_level, req_priv_level, user, host, prog)
                raise Exception('Authorisation failure')
            LOG.info('[client-command] %s %s@%s:%s', fcn.__name__, user, host,
                     prog)
            return fcn(self, *args, **kwargs)
Example #20
    def put_broadcast(self,
                      point_strings=None,
                      namespaces=None,
                      settings=None):
        """Add new broadcast settings (server side interface).

        Return a tuple (modified_settings, bad_options) where:
          modified_settings is a list of modified settings in the form:
            [("20200202", "foo", {"script": "true"}), ...]
          bad_options is as described in the docstring for self.clear().
        """
        modified_settings = []
        bad_point_strings = []
        bad_namespaces = []

        with self.lock:
            for setting in settings:
                for point_string in point_strings:
                    # Standardise the point and check its validity.
                    bad_point = False
                    try:
                        point_string = standardise_point_string(point_string)
                    except PointParsingError:
                        if point_string != '*':
                            bad_point_strings.append(point_string)
                            bad_point = True
                    if not bad_point and point_string not in self.broadcasts:
                        self.broadcasts[point_string] = {}
                    for namespace in namespaces:
                        if namespace not in self.linearized_ancestors:
                            bad_namespaces.append(namespace)
                        elif not bad_point:
                            if namespace not in self.broadcasts[point_string]:
                                self.broadcasts[point_string][namespace] = {}
                            self._addict(
                                self.broadcasts[point_string][namespace],
                                setting)
                            modified_settings.append(
                                (point_string, namespace, setting))

        # Log the broadcast
        self.suite_db_mgr.put_broadcast(modified_settings)
        LOG.info(get_broadcast_change_report(modified_settings))

        bad_options = {}
        if bad_point_strings:
            bad_options["point_strings"] = bad_point_strings
        if bad_namespaces:
            bad_options["namespaces"] = bad_namespaces
        return modified_settings, bad_options
Example #21
    def test_value_error_raises_system_exit(self, mocked_glbl_cfg):
        """Test that a ValueError when writing to a log stream won't result
        in multiple exceptions (which could lead to an infinite loop on some
        occasions). Instead, it **must** raise a SystemExit."""
        with tempfile.NamedTemporaryFile() as tf:
            # mock objects used when creating the file handler
            mocked = mock.MagicMock()
            mocked_glbl_cfg.return_value = mocked
            mocked.get_derived_host_item.return_value = tf.name
            mocked.get.return_value = 100
            file_handler = TimestampRotatingFileHandler("suiteA", False)
            # next line is important as pytest can have a "Bad file descriptor"
            # due to a FileHandler with default "a" (pytest tries to r/w).
            file_handler.mode = "a+"

            # enable the logger
            LOG.setLevel(logging.INFO)
            LOG.addHandler(file_handler)

            # Disable raising uncaught exceptions in logging, due to file
            # handler using stdin.fileno. See the following links for more.
            # https://github.com/pytest-dev/pytest/issues/2276 &
            # https://github.com/pytest-dev/pytest/issues/1585
            logging.raiseExceptions = False

            # first message will initialize the stream and the handler
            LOG.info("What could go")

            # here we change the stream of the handler
            old_stream = file_handler.stream
            file_handler.stream = mock.MagicMock()
            file_handler.stream.seek = mock.MagicMock()
            # make the mocked stream's seek raise, as if it were broken
            file_handler.stream.seek.side_effect = ValueError

            try:
                # next call will call the emit method and use the mocked stream
                LOG.info("wrong?!")
                self.fail("Exception SystemError was not raised")
            except SystemExit:
                pass
            finally:
                # clean up
                file_handler.stream = old_stream
                # for log_handler in LOG.handlers:
                #     log_handler.close()
                file_handler.close()
                LOG.removeHandler(file_handler)
                logging.raiseExceptions = True
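The logging.raiseExceptions flag toggled in this test is a module-level switch in the standard library: when False, exceptions raised inside a handler's emit() are swallowed by Handler.handleError() instead of being reported to stderr. A quick illustration with a deliberately broken stream:

import io
import logging

class BrokenStream(io.StringIO):
    def write(self, s):
        raise ValueError("boom")

log = logging.getLogger("demo")
log.addHandler(logging.StreamHandler(BrokenStream()))
logging.raiseExceptions = False   # emit() errors are now swallowed silently
log.error("handled quietly")
logging.raiseExceptions = True    # restore default error reporting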
Example #23
        def _authorise(self, *args, user='******', meta=None, **kwargs):
            if not meta:
                meta = {}
            host = meta.get('host', '?')
            prog = meta.get('prog', '?')

            usr_priv_level = self._get_priv_level(user)
            if usr_priv_level < req_priv_level:
                LOG.warn(
                    "[client-connect] DENIED (privilege '%s' < '%s') %s@%s:%s",
                    usr_priv_level, req_priv_level, user, host, prog)
                raise Exception('Authorisation failure')
            LOG.info(
                '[client-command] %s %s@%s:%s', fcn.__name__, user, host, prog)
            return fcn(self, *args, **kwargs)
Example #24
    def check_task_jobs(self, suite, task_pool):
        """Check submission and execution timeout and polling timers.

        Poll tasks that have timed out and/or have reached next polling time.
        """
        now = time()
        poll_tasks = set()
        for itask in task_pool.get_tasks():
            if self.task_events_mgr.check_job_time(itask, now):
                poll_tasks.add(itask)
                if itask.poll_timer.delay is not None:
                    LOG.info('[%s] -poll now, (next in %s)', itask,
                             itask.poll_timer.delay_timeout_as_str())
        if poll_tasks:
            self.poll_task_jobs(suite, poll_tasks)
Example #25
    def put_broadcast(
            self, point_strings=None, namespaces=None, settings=None):
        """Add new broadcast settings (server side interface).

        Return a tuple (modified_settings, bad_options) where:
          modified_settings is a list of modified settings in the form:
            [("20200202", "foo", {"script": "true"}), ...]
          bad_options is as described in the docstring for self.clear().
        """
        modified_settings = []
        bad_point_strings = []
        bad_namespaces = []

        with self.lock:
            for setting in settings:
                for point_string in point_strings:
                    # Standardise the point and check its validity.
                    bad_point = False
                    try:
                        point_string = standardise_point_string(point_string)
                    except PointParsingError:
                        if point_string != '*':
                            bad_point_strings.append(point_string)
                            bad_point = True
                    if not bad_point and point_string not in self.broadcasts:
                        self.broadcasts[point_string] = {}
                    for namespace in namespaces:
                        if namespace not in self.linearized_ancestors:
                            bad_namespaces.append(namespace)
                        elif not bad_point:
                            if namespace not in self.broadcasts[point_string]:
                                self.broadcasts[point_string][namespace] = {}
                            self._addict(
                                self.broadcasts[point_string][namespace],
                                setting)
                            modified_settings.append(
                                (point_string, namespace, setting))

        # Log the broadcast
        self.suite_db_mgr.put_broadcast(modified_settings)
        LOG.info(get_broadcast_change_report(modified_settings))

        bad_options = {}
        if bad_point_strings:
            bad_options["point_strings"] = bad_point_strings
        if bad_namespaces:
            bad_options["namespaces"] = bad_namespaces
        return modified_settings, bad_options
Example #26
    def upgrade_with_state_file(self, state_file_path):
        """Upgrade database on restart with an old state file.

        Upgrade database from a state file generated by a suite that ran with
        an old cylc version.
        """
        check_points = []
        self.select_checkpoint_id(
            lambda row_idx, row: check_points.append(row),
            self.CHECKPOINT_LATEST_ID)
        if check_points:
            # No need to upgrade if the latest checkpoint already exists
            return
        LOG.info("Upgrading suite db with %s ...", state_file_path)
        self._upgrade_with_state_file_states(state_file_path)
        self._upgrade_with_state_file_extras()
Example #27
    def check_task_jobs(self, suite, task_pool):
        """Check submission and execution timeout and polling timers.

        Poll tasks that have timed out and/or have reached next polling time.
        """
        now = time()
        poll_tasks = set()
        for itask in task_pool.get_tasks():
            if self.task_events_mgr.check_job_time(itask, now):
                poll_tasks.add(itask)
                if itask.poll_timer.delay is not None:
                    LOG.info(
                        '[%s] -poll now, (next in %s)',
                        itask, itask.poll_timer.delay_timeout_as_str())
        if poll_tasks:
            self.poll_task_jobs(suite, poll_tasks)
Example #28
 def _upgrade_with_state_file_tasks(self, line):
     """Parse a task state line in state file, add information to DB."""
     head, tail = line.split(" : ", 1)
     name, cycle = head.split(".")
     status = None
     spawned = None
     for item in tail.split(", "):
         key, value = item.split("=", 1)
         if key == "status":
             status = value
         elif key == "spawned":
             spawned = int(value in ["True", "true"])
     self.add_insert_item(self.TABLE_TASK_POOL, {
         "name": name,
         "cycle": cycle,
         "spawned": spawned,
         "status": status,
         "hold_swap": None})
     LOG.info(" + %s", head)
Example #29
    def _upgrade_with_state_file_states(self, state_file_path):
        """Helper for self.upgrade_with_state_file().

        Populate the new database tables with information from state file.
        """
        location = None
        LOG.info("Populating %s table", self.TABLE_SUITE_PARAMS)
        for line in open(state_file_path):
            line = line.strip()
            if location is None:
                # run mode, time stamp, initial cycle, final cycle
                location = self._upgrade_with_state_file_header(line)
            elif location == "broadcast":
                # Ignore broadcast json in state file.
                # The "broadcast_states" table should already be populated.
                if line == "Begin task states":
                    location = "task states"
                    LOG.info("Populating %s table", self.TABLE_TASK_POOL)
            else:
                self._upgrade_with_state_file_tasks(line)
        self.execute_queued_items()
Example #30
    def _process_message_submitted(self, itask, event_time):
        """Helper for process_message, handle a submit-succeeded message."""
        try:
            LOG.info(
                '[%s] -job[%02d] submitted to %s:%s[%s]',
                itask,
                itask.summary['submit_num'],
                itask.summary['host'],
                itask.summary['batch_sys_name'],
                itask.summary['submit_method_id'])
        except KeyError:
            pass
        self.suite_db_mgr.put_update_task_jobs(itask, {
            "time_submit_exit": event_time,
            "submit_status": 0,
            "batch_sys_job_id": itask.summary.get('submit_method_id')})

        if itask.tdef.run_mode == 'simulation':
            # Simulate job execution at this point.
            itask.set_summary_time('submitted', event_time)
            itask.set_summary_time('started', event_time)
            itask.state.reset_state(TASK_STATUS_RUNNING)
            itask.state.outputs.set_completion(TASK_OUTPUT_STARTED, True)
            return

        itask.set_summary_time('submitted', event_time)
        # Unset started and finished times in case of resubmission.
        itask.set_summary_time('started')
        itask.set_summary_time('finished')
        itask.set_summary_message(TASK_OUTPUT_SUBMITTED)

        self.pflag = True
        if itask.state.status == TASK_STATUS_READY:
            # The job started message can (rarely) come in before the submit
            # command returns - in which case do not go back to 'submitted'.
            if itask.state.reset_state(TASK_STATUS_SUBMITTED):
                self.setup_event_handlers(
                    itask, TASK_OUTPUT_SUBMITTED, 'job submitted')
            self._reset_job_timers(itask)
Example #31
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(suite, point, name,
                                                 submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write((ctx_str + '\n').encode())
    except IOError as exc:
        # This happens when there is no job directory, e.g. if a job host
        # selection command causes a submission failure, there will be no job
        # directory. In this case, just send the information to the suite log.
        LOG.exception(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
Example #33
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write((ctx_str + '\n').encode())
    except IOError as exc:
        # This happens when there is no job directory, e.g. if a job host
        # selection command causes a submission failure, there will be no job
        # directory. In this case, just send the information to the suite log.
        LOG.exception(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
Example #34
    def verify_triggering(self):
        new = self.get_triggered(self.new_loglines)
        ref = self.get_triggered(self.ref_loglines)

        if len(new) == 0:
            raise LogAnalyserError(
                "ERROR: new log contains no triggering info.")

        if len(ref) == 0:
            raise LogAnalyserError(
                "ERROR: reference log contains no triggering info.")

        new.sort()
        ref.sort()

        if new != ref:
            diff = unified_diff(new, ref)
            raise LogAnalyserError(
                "ERROR: triggering is NOT consistent with the reference log:" +
                '\n' + '\n'.join(diff) + '\n')
        else:
            LOG.info(
                "LogAnalyser: triggering is consistent with the reference log")
Example #35
 def load_db_broadcast_states(self, row_idx, row):
     """Load broadcast variables from runtime DB broadcast states row."""
     if row_idx == 0:
         LOG.info("LOADING broadcast states")
     point, namespace, key, value = row
     sections = []
     cur_key = key
     if "]" in cur_key:
         sections = self.REC_SECTION.findall(cur_key)
         cur_key = cur_key.rsplit(r"]", 1)[-1]
     with self.lock:
         self.broadcasts.setdefault(point, {})
         self.broadcasts[point].setdefault(namespace, {})
         dict_ = self.broadcasts[point][namespace]
         for section in sections:
             dict_.setdefault(section, {})
             dict_ = dict_[section]
         dict_[cur_key] = value
     LOG.info(CHANGE_FMT.strip() % {
         "change": CHANGE_PREFIX_SET,
         "point": point,
         "namespace": namespace,
         "key": key,
         "value": value})
Example #36
 def _reset_job_timers(self, itask):
     """Set up poll timer and timeout for task."""
     if itask.state.status not in TASK_STATUSES_ACTIVE:
         # Reset, task not active
         itask.timeout = None
         itask.poll_timer = None
         return
     ctx = (itask.submit_num, itask.state.status)
     if itask.poll_timer and itask.poll_timer.ctx == ctx:
         return
     # Set poll timer
     # Set timeout
     timeref = None  # reference time, submitted or started time
     timeout = None  # timeout in setting
     if itask.state.status == TASK_STATUS_RUNNING:
         timeref = itask.summary['started_time']
         timeout_key = 'execution timeout'
         timeout = self._get_events_conf(itask, timeout_key)
         delays = list(self.get_host_conf(
             itask, 'execution polling intervals', skey='job',
             default=[900]))  # Default 15 minute intervals
         if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
             time_limit = itask.summary[self.KEY_EXECUTE_TIME_LIMIT]
             try:
                 host_conf = self.get_host_conf(itask, 'batch systems')
                 batch_sys_conf = host_conf[itask.summary['batch_sys_name']]
             except (TypeError, KeyError):
                 batch_sys_conf = {}
             time_limit_delays = batch_sys_conf.get(
                 'execution time limit polling intervals', [60, 120, 420])
             timeout = time_limit + sum(time_limit_delays)
             # Remove excessive polling before time limit
             while sum(delays) > time_limit:
                 del delays[-1]
             # But fill up the gap before time limit
             if delays:
                 size = int((time_limit - sum(delays)) / delays[-1])
                 delays.extend([delays[-1]] * size)
             time_limit_delays[0] += time_limit - sum(delays)
             delays += time_limit_delays
     else:  # if itask.state.status == TASK_STATUS_SUBMITTED:
         timeref = itask.summary['submitted_time']
         timeout_key = 'submission timeout'
         timeout = self._get_events_conf(itask, timeout_key)
         delays = list(self.get_host_conf(
             itask, 'submission polling intervals', skey='job',
             default=[900]))  # Default 15 minute intervals
     try:
         itask.timeout = timeref + float(timeout)
         timeout_str = intvl_as_str(timeout)
     except (TypeError, ValueError):
         itask.timeout = None
         timeout_str = None
     itask.poll_timer = TaskActionTimer(ctx=ctx, delays=delays)
     # Log timeout and polling schedule
     message = 'health check settings: %s=%s' % (timeout_key, timeout_str)
     # Attempt to group identical consecutive delays as N*DELAY,...
     if itask.poll_timer.delays:
         items = []  # [(number of item - 1, item), ...]
         for delay in itask.poll_timer.delays:
             if items and items[-1][1] == delay:
                 items[-1][0] += 1
             else:
                 items.append([0, delay])
         message += ', polling intervals='
         for num, item in items:
             if num:
                 message += '%d*' % (num + 1)
             message += '%s,' % intvl_as_str(item)
         message += '...'
     LOG.info('[%s] -%s', itask, message)
     # Set next poll time
     self.check_poll_time(itask)
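The tail of the method compresses runs of identical polling delays into "N*DELAY" notation for the log line. The grouping logic on its own, as a sketch:

def group_delays(delays):
    items = []  # [[count - 1, delay], ...]
    for delay in delays:
        if items and items[-1][1] == delay:
            items[-1][0] += 1
        else:
            items.append([0, delay])
    return ','.join(
        '%d*%s' % (num + 1, d) if num else str(d) for num, d in items)

print(group_delays([60, 60, 60, 120, 420]))  # 3*60,120,420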
Example #37
 def load_xtrigger_for_restart(self, row_idx, row):
     """Load satisfied xtrigger results from suite DB."""
     if row_idx == 0:
         LOG.info("LOADING satisfied xtriggers")
     sig, results = row
     self.sat_xtrig[sig] = json.loads(results)
Example #38
    def upgrade_pickle_to_json(self):
        """Upgrade the database tables if containing pickled objects.

        Back compat for <=7.6.X.
        """
        conn = self.connect()
        t_name = self.TABLE_TASK_ACTION_TIMERS
        if "_pickle" not in self.select_table_schema("table", t_name):
            return

        # Rename old tables
        conn.execute(r"ALTER TABLE %(table)s RENAME TO %(table)s_old" %
                     {"table": t_name})
        conn.commit()

        # Create tables with new columns
        self.create_tables()

        # Populate new tables using old column data
        # Codacy: Pickle library appears to be in use, possible security issue.
        # Use of "pickle" module is for loading data written by <=7.6.X of Cylc
        # in users' own spaces.
        import pickle
        LOG.info(r"Upgrading %s table", t_name)
        cols = []
        for col in self.tables[t_name].columns:
            if col.name in ['ctx_key', 'ctx', 'delays']:
                cols.append(col.name + '_pickle')
            else:
                cols.append(col.name)
        # Codacy: Possible SQL injection vector through string-based query
        # construction.
        # This is highly unlikely - all strings in the construct are from
        # constants in this module.
        for i, row in enumerate(
                conn.execute(r"SELECT " + ",".join(cols) + " FROM " + t_name +
                             "_old")):
            args = []
            try:
                for col, cell in zip(cols, row):
                    if col == "ctx_pickle":
                        # Upgrade pickled namedtuple objects
                        orig = pickle.loads(str(cell))
                        if orig is not None:
                            args.append(
                                json.dumps([
                                    type(orig).__name__,
                                    orig.__getnewargs__()
                                ]))
                        else:
                            args.append(json.dumps(orig))
                    elif col.endswith("_pickle"):
                        # Upgrade pickled lists
                        args.append(json.dumps(pickle.loads(str(cell))))
                    else:
                        args.append(cell)
            except (EOFError, TypeError, LookupError, ValueError):
                pass
            else:
                # These tables can be big, so we don't want to queue the items
                # in memory.
                conn.execute(self.tables[t_name].get_insert_stmt(), args)
        conn.commit()

        # Drop old tables
        conn.execute(r"DROP TABLE %(table)s_old" % {"table": t_name})
        conn.commit()
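Per cell, the essential conversion is unpickle then re-serialise as JSON. The core round-trip in two lines (note that Python 3's pickle.loads takes bytes, so the str(cell) in the example above reflects its Python 2 origins):

import json
import pickle

cell = pickle.dumps([30, 60, 120])     # what the old <=7.6.X DB stored
print(json.dumps(pickle.loads(cell)))  # what the new schema stores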
Example #39
    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for a host
        select command to complete, or tasks that are waiting for remote
        initialisation. A bad host select command, an error writing to a job
        file, or bad remote initialisation will cause a bad task, leading to
        submission failure.

        This method uses prep_submit_task_job() as helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            if (
                self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name']) and
                not is_remote_host(host)
            ):
                owner_at_host = get_host()
            else:
                owner_at_host = host
            # Persist
            if owner:
                owner_at_host = owner + '@' + owner_at_host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info(
                    '[%s] -submit-num=%d, owner@host=%s',
                    itask, itask.submit_num, owner_at_host)
                self.suite_db_mgr.put_insert_task_jobs(itask, {
                    'is_manual_submit': itask.is_manual_submit,
                    'try_num': itask.get_try_num(),
                    'time_submit': now_str,
                    'user_at_host': owner_at_host,
                    'batch_sys_name': itask.summary['batch_sys_name'],
                })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(
                            self.JOBS_SUBMIT,
                            '(init %s)' % owner_at_host,
                            err=REMOTE_INIT_FAILED,
                            ret_code=1),
                        suite, itask.point, itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [
                    ('host', host, is_remote_host),
                    ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, 'suite job log directory', host, owner))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size] for i in range(0,
                                                        len(itasks),
                                                        chunk_size)]
            LOG.debug(
                '%s ... # will invoke in batches, sizes=%s',
                cmd, [len(b) for b in itasks_batches])
            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num))
                    job_log_dirs.append(get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset_state(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)
                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                        **kwargs
                    ),
                    self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks
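The chunk-size arithmetic above is easy to misread, so here is a minimal standalone sketch of what it does (the helper name chop, the target of 100 and the 250-item input are illustrative, not part of the example): it splits the task list into roughly equal batches of at most about 100 items, so that no single "cylc jobs-submit" invocation floods the stdout and stderr pipes.

    def chop(items, target=100):
        """Split items into roughly equal batches of <= ~target items."""
        chunk_size = len(items) // ((len(items) // target) + 1) + 1
        return [items[i:i + chunk_size]
                for i in range(0, len(items), chunk_size)]

    print([len(b) for b in chop(list(range(250)))])  # [84, 84, 82]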
Example #40
0
 def _reset_job_timers(self, itask):
     """Set up poll timer and timeout for task."""
     if itask.state.status not in TASK_STATUSES_ACTIVE:
         # Reset, task not active
         itask.timeout = None
         itask.poll_timer = None
         return
     ctx = (itask.submit_num, itask.state.status)
     if itask.poll_timer and itask.poll_timer.ctx == ctx:
         return
     # Set up the poll timer and timeout for the task's current state
     timeref = None  # reference time: submitted or started time
     timeout = None  # configured timeout setting
     if itask.state.status == TASK_STATUS_RUNNING:
         timeref = itask.summary['started_time']
         timeout_key = 'execution timeout'
         timeout = self._get_events_conf(itask, timeout_key)
         delays = list(self.get_host_conf(
             itask, 'execution polling intervals', skey='job',
             default=[900]))  # default: 15-minute intervals
         if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
             time_limit = itask.summary[self.KEY_EXECUTE_TIME_LIMIT]
             try:
                 host_conf = self.get_host_conf(itask, 'batch systems')
                 batch_sys_conf = host_conf[itask.summary['batch_sys_name']]
             except (TypeError, KeyError):
                 batch_sys_conf = {}
             time_limit_delays = batch_sys_conf.get(
                 'execution time limit polling intervals', [60, 120, 420])
             timeout = time_limit + sum(time_limit_delays)
             # Drop polling intervals that would overshoot the time limit
             while sum(delays) > time_limit:
                 del delays[-1]
             # ... then pad with copies of the last interval to fill the gap
             if delays:
                 size = int((time_limit - sum(delays)) / delays[-1])
                 delays.extend([delays[-1]] * size)
             time_limit_delays[0] += time_limit - sum(delays)
             delays += time_limit_delays
     else:  # if itask.state.status == TASK_STATUS_SUBMITTED:
         timeref = itask.summary['submitted_time']
         timeout_key = 'submission timeout'
         timeout = self._get_events_conf(itask, timeout_key)
         delays = list(self.get_host_conf(
             itask, 'submission polling intervals', skey='job',
             default=[900]))  # default: 15-minute intervals
     try:
         itask.timeout = timeref + float(timeout)
         timeout_str = intvl_as_str(timeout)
     except (TypeError, ValueError):
         itask.timeout = None
         timeout_str = None
     itask.poll_timer = TaskActionTimer(ctx=ctx, delays=delays)
     # Log timeout and polling schedule
     message = 'health check settings: %s=%s' % (timeout_key, timeout_str)
     # Attempt to group identical consecutive delays as N*DELAY,...
     if itask.poll_timer.delays:
         items = []  # [(repeat count - 1, delay), ...]
         for delay in itask.poll_timer.delays:
             if items and items[-1][1] == delay:
                 items[-1][0] += 1
             else:
                 items.append([0, delay])
         message += ', polling intervals='
         for num, item in items:
             if num:
                 message += '%d*' % (num + 1)
             message += '%s,' % intvl_as_str(item)
         message += '...'
     LOG.info('[%s] -%s', itask, message)
     # Set next poll time
     self.check_poll_time(itask)
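To see how the execution-time-limit branch reshapes the polling schedule, here is a minimal sketch with made-up numbers (the time limit and both interval lists are illustrative): polls that would overshoot the time limit are dropped, the remaining gap is padded with copies of the last interval, and the post-limit intervals are appended relative to the limit itself.

    time_limit = 3600                   # execution time limit (seconds)
    delays = [900] * 5                  # 'execution polling intervals'
    time_limit_delays = [60, 120, 420]  # post-limit polling intervals

    # Drop polls that would overshoot the time limit ...
    while sum(delays) > time_limit:
        del delays[-1]
    # ... then pad with copies of the last interval to fill the gap.
    if delays:
        size = int((time_limit - sum(delays)) / delays[-1])
        delays.extend([delays[-1]] * size)
    # Anchor the first post-limit poll to the time limit itself.
    time_limit_delays[0] += time_limit - sum(delays)
    delays += time_limit_delays
    print(delays)  # [900, 900, 900, 900, 60, 120, 420]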
Example #41
0
 def _run_event_mail_callback(proc_ctx):
     """Callback the mail command for notification of a suite event."""
     if proc_ctx.ret_code:
         LOG.warning(str(proc_ctx))
     else:
         LOG.info(str(proc_ctx))
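The callback simply picks a log level from the subprocess return code. A minimal sketch of how it behaves (DummyCtx is a stand-in for the real SubProcContext here):

    class DummyCtx:
        """Stand-in context carrying only what the callback reads."""
        def __init__(self, ret_code):
            self.ret_code = ret_code

        def __str__(self):
            return 'mail command exited %d' % self.ret_code

    _run_event_mail_callback(DummyCtx(0))  # logged via LOG.info
    _run_event_mail_callback(DummyCtx(1))  # logged via LOG.warning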
Example #42
0
 def load_xtrigger_for_restart(self, row_idx, row):
     """Load satisfied xtrigger results from suite DB."""
     if row_idx == 0:
         LOG.info("LOADING satisfied xtriggers")
     sig, results = row
     self.sat_xtrig[sig] = json.loads(results)
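A minimal sketch of the restart path that feeds this loader, assuming the suite database stores one (signature, JSON-encoded results) row per satisfied xtrigger (the file, table and column names here are illustrative):

    import json
    import sqlite3

    conn = sqlite3.connect('suite.db')
    sat_xtrig = {}
    for row_idx, row in enumerate(
            conn.execute('SELECT signature, results FROM xtriggers')):
        sig, results = row
        sat_xtrig[sig] = json.loads(results)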
Example #43
0
    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for the
        host select command to complete, or that are waiting for remote
        initialisation. A bad host select command, an error writing the job
        file, or failed remote initialisation makes the task a bad task,
        leading to submission failure.

        This method uses prep_submit_task_jobs() as a helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            if (
                self.batch_sys_mgr.is_job_local_to_host(
                    # All tasks in this group share (host, owner), so check
                    # the first one's batch system.
                    itasks[0].summary['batch_sys_name']) and
                not is_remote_host(host)
            ):
                owner_at_host = get_host()
            else:
                owner_at_host = host
            # Prefix the owner, if any, to give owner@host
            if owner:
                owner_at_host = owner + '@' + owner_at_host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info(
                    '[%s] -submit-num=%d, owner@host=%s',
                    itask, itask.submit_num, owner_at_host)
                self.suite_db_mgr.put_insert_task_jobs(itask, {
                    'is_manual_submit': itask.is_manual_submit,
                    'try_num': itask.get_try_num(),
                    'time_submit': now_str,
                    'user_at_host': owner_at_host,
                    'batch_sys_name': itask.summary['batch_sys_name'],
                })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(
                            self.JOBS_SUBMIT,
                            '(init %s)' % owner_at_host,
                            err=REMOTE_INIT_FAILED,
                            ret_code=1),
                        suite, itask.point, itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [
                    ('host', host, is_remote_host),
                    ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, 'suite job log directory', host, owner))
            # Chop itasks into a series of shorter lists if the group is
            # very big, to avoid overloading the stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size]
                for i in range(0, len(itasks), chunk_size)]
            LOG.debug(
                '%s ... # will invoke in batches, sizes=%s',
                cmd, [len(b) for b in itasks_batches])
            for itasks_batch in itasks_batches:
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num))
                    job_log_dirs.append(get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that a subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset_state(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)
                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                        **kwargs
                    ),
                    self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks
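For clarity, here is a minimal standalone sketch of how the "cylc jobs-submit" command line above acquires its --host/--user flags and remote mode (the host, owner and test functions are stand-ins, not the real is_remote_host/is_remote_user helpers):

    def build_cmd(host, owner, is_remote_host, is_remote_user):
        """Append --host/--user flags; any remote part implies remote mode."""
        cmd = ['cylc', 'jobs-submit']
        remote_mode = False
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        return cmd

    print(build_cmd('hpc1', 'alice',
                    lambda host: host != 'localhost',
                    lambda owner: owner is not None))
    # ['cylc', 'jobs-submit', '--host=hpc1', '--user=alice',
    #  '--remote-mode', '--']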