Example #1
0
 def load_config(self):
     """(Re)load the suite configuration into self.suiterc.

     If a configuration is already loaded this is treated as a reload,
     and the closed_families of the current configuration are passed on
     to the new one as its "collapsed" list.

     Return True on success. On failure the error is logged; then, in
     interactive mode, an error dialog is shown and False is returned,
     otherwise the process exits with status 1.
     """
     current = self.suiterc
     if current:
         reloading, closed = True, current.closed_families
     else:
         reloading, closed = False, []
     try:
         self.suiterc = SuiteConfig(
             self.suite,
             self.file,
             self.template_vars,
             is_reload=reloading,
             collapsed=closed,
             cli_initial_point_string=self.start_point_string,
             vis_start_string=self.start_point_string,
             vis_stop_string=self.stop_point_string)
     except Exception as exc:
         msg = "Failed - parsing error?\n\n%s" % exc
         LOG.error(msg)
         if not self.interactive:
             sys.exit(1)
         # Interactive: report in a dialog and let the caller carry on.
         dialog = gtk.MessageDialog(
             type=gtk.MESSAGE_ERROR,
             buttons=gtk.BUTTONS_OK,
             message_format=msg)
         dialog.run()
         dialog.destroy()
         return False
     self.inherit = self.suiterc.get_parent_lists()
     return True
Example #2
0
 def _process_message_submit_failed(self, itask, event_time):
     """Helper for process_message, handle a submit-failed message.

     Record the failed submission in the task jobs table (using the
     current time if event_time is None), then reset the task to either
     submit-retrying (a submission retry timer yields a next delay) or
     submit-failed. Event handlers are set up only when reset_state()
     returns a true value.
     """
     LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
     exit_time = event_time
     if exit_time is None:
         exit_time = get_current_time_string()
     self.suite_db_mgr.put_update_task_jobs(
         itask, {"time_submit_exit": exit_time, "submit_status": 1})
     itask.summary['submit_method_id'] = None
     self.pflag = True
     timers = itask.try_timers
     # next() is only called when a submission retry timer exists.
     if (TASK_STATUS_SUBMIT_RETRYING in timers and
             timers[TASK_STATUS_SUBMIT_RETRYING].next() is not None):
         # There is a submission retry lined up.
         retry_timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
         delay_msg = (
             "submit-retrying in %s" % retry_timer.delay_timeout_as_str())
         msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
         LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
         itask.set_summary_message(msg)
         if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
             self.setup_event_handlers(
                 itask, self.EVENT_SUBMIT_RETRY,
                 "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
     elif itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
         # No submission retry lined up: definitive failure.
         # See github #476.
         self.setup_event_handlers(
             itask, self.EVENT_SUBMIT_FAILED,
             'job %s' % self.EVENT_SUBMIT_FAILED)
     self._reset_job_timers(itask)
Example #3
0
 def load_db_broadcast_states(self, row_idx, row):
     """Load one broadcast setting from a runtime DB broadcast states row.

     row unpacks to (point, namespace, key, value). If key contains "]",
     the names found in it by REC_SECTION become nested dict levels under
     self.broadcasts[point][namespace] and the text after the last "]"
     is the item name; otherwise key itself is the item name.
     """
     if row_idx == 0:
         LOG.info("LOADING broadcast states")
     point, namespace, key, value = row
     if "]" in key:
         section_names = self.REC_SECTION.findall(key)
         item_name = key.rsplit(r"]", 1)[-1]
     else:
         section_names = []
         item_name = key
     with self.lock:
         # Walk down to the target dict, creating levels as needed.
         node = self.broadcasts.setdefault(point, {})
         node = node.setdefault(namespace, {})
         for section_name in section_names:
             node = node.setdefault(section_name, {})
         node[item_name] = value
     change = {
         "change": CHANGE_PREFIX_SET,
         "point": point,
         "namespace": namespace,
         "key": key,
         "value": value,
     }
     LOG.info(CHANGE_FMT.strip() % change)
Example #4
0
 def _set_state(self, status):
     """Set, log and record a normal (not forced) task status change.

     The task_events table is not updated here.

     Return a (previous status, previous hold swap) tuple, or None if
     the requested status is already current and no hold swap is set.
     """
     if self.hold_swap == self.status:
         self.hold_swap = None
     if self.hold_swap is None and status == self.status:
         return
     old_status, old_swap = self.status, self.hold_swap
     if status == TASK_STATUS_HELD:
         self.hold_swap = old_status
     elif status in TASK_STATUSES_ACTIVE:
         if old_status == TASK_STATUS_HELD:
             self.hold_swap = TASK_STATUS_HELD
     elif (old_swap == TASK_STATUS_HELD and
             status not in TASK_STATUSES_FINAL):
         # Keep the requested status as the swap value; status is held.
         self.hold_swap, status = status, TASK_STATUS_HELD
     elif old_swap:
         self.hold_swap = None
     self.status = status
     self.time_updated = get_current_time_string()
     self.is_updated = True
     # Log as "old (old swap) => new (new swap)".
     parts = [str(old_status)]
     if old_swap:
         parts.append(" (%s)" % old_swap)
     parts.append(" => %s" % self.status)
     if self.hold_swap:
         parts.append(" (%s)" % self.hold_swap)
     LOG.debug("[%s] -%s", self.identity, "".join(parts))
     return (old_status, old_swap)
Example #5
0
    def _check_access_priv_and_report(self,
                                      required_privilege_level,
                                      log_info=True):
        """Check access privilege and log requests with identifying info.

        The allowed connection is always logged at debug level. The
        command itself is logged at info level in debug mode, or else
        only for a client uuid not yet in self.clients and only when
        log_info is true.

        Return:
            dict: the entry held for this client uuid in self.clients

        """
        self._check_access_priv(required_privilege_level)
        # Name of the calling function; must be read from this frame.
        command = inspect.currentframe().f_back.f_code.co_name
        auth_user, prog_name, user, host, uuid = _get_client_info()
        priv_level = self._get_priv_level(auth_user)
        LOG.debug(
            self.LOG_CONNECT_ALLOWED_TMPL, user, host, prog_name, priv_level,
            uuid)
        if cylc.flags.debug or (uuid not in self.clients and log_info):
            LOG.info(
                self.LOG_COMMAND_TMPL, command, user, host, prog_name, uuid)
        session = self.clients.setdefault(uuid, {})
        session['time'] = time()
        self._housekeep()
        return self.clients[uuid]
Example #6
0
 def _process_message_succeeded(self, itask, event_time):
     """Helper for process_message, handle a succeeded message.

     Record the finish time and run status, append the elapsed time to
     the task definition if a start time is known, log any outputs that
     were not completed (other than expired, submit-failed and failed),
     then reset the task state to succeeded.
     """
     self.pflag = True
     itask.set_summary_time('finished', event_time)
     self.suite_db_mgr.put_update_task_jobs(
         itask, {"run_status": 0, "time_run_exit": event_time})
     # Update mean elapsed time only on task succeeded.
     if itask.summary['started_time'] is not None:
         elapsed = (
             itask.summary['finished_time'] - itask.summary['started_time'])
         itask.tdef.elapsed_times.append(elapsed)
     if not itask.state.outputs.all_completed():
         not_reported = (
             TASK_OUTPUT_EXPIRED,
             TASK_OUTPUT_SUBMIT_FAILED,
             TASK_OUTPUT_FAILED)
         msg = "".join(
             "\n  " + output
             for output in itask.state.outputs.get_not_completed()
             if output not in not_reported)
         if msg:
             LOG.info(
                 "[%s] -Succeeded with outputs not completed: %s",
                 itask, msg)
     if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
         self.setup_event_handlers(itask, "succeeded", "job succeeded")
     self._reset_job_timers(itask)
Example #7
0
 def _report_connection_if_denied(self):
     """Log a warning if the client connection is reported as denied."""
     client_info = _get_client_info()
     prog_name, user, host, uuid = client_info[1:]
     if not self._get_client_connection_denied():
         return
     LOG.warning(
         self.LOG_CONNECT_DENIED_TMPL, user, host, prog_name, uuid)
Example #8
0
    def _execute_stmt(self, stmt, stmt_args_list):
        """Helper for "self.execute_queued_items".

        Execute a statement. If this is the public database, return True on
        success and False on failure. If this is the private database, return
        True on success, and raise on failure.
        """
        try:
            self.connect()
            self.conn.executemany(stmt, stmt_args_list)
        except sqlite3.Error:
            if not self.is_public:
                raise
            if cylc.flags.debug:
                traceback.print_exc()
            err_log = ("cannot execute database statement:\n"
                       "file=%(file)s:\nstmt=%(stmt)s") % {
                           "file": self.db_file_name,
                           "stmt": stmt
                       }
            for i, stmt_args in enumerate(stmt_args_list):
                err_log += ("\nstmt_args[%(i)d]=%(stmt_args)s" % {
                    "i": i,
                    "stmt_args": stmt_args
                })
            LOG.warning(err_log)
            raise
Example #9
0
 def list_suites(self, regfilter=None):
     """Return a filtered list of valid suite registrations.

     Walk the run directory (following links). Each directory whose
     service directory holds the source item, and whose path relative
     to the run directory matches regfilter (if given), contributes a
     [registration, source directory, title] entry.

     Raise ValueError if regfilter is not a valid regular expression.
     """
     matcher = None
     if regfilter:
         try:
             matcher = re.compile(regfilter)
         except re.error as exc:
             raise ValueError("%s: %s" % (regfilter, exc))
     run_d = glbl_cfg().get_host_item('run directory')
     suites = []
     for dirpath, dnames, _ in os.walk(run_d, followlinks=True):
         # Always descend for top directory, but
         # don't descend further if it has a .service/ dir
         if self.DIR_BASE_SRV in dnames and dirpath != run_d:
             del dnames[:]
         # Choose only suites with .service and matching filter
         reg = os.path.relpath(dirpath, run_d)
         srv_d = os.path.join(dirpath, self.DIR_BASE_SRV)
         if not self._locate_item(self.FILE_BASE_SOURCE, srv_d):
             continue
         if matcher and not matcher.search(reg):
             continue
         try:
             entry = [
                 reg,
                 self.get_suite_source_dir(reg),
                 self.get_suite_title(reg)]
             suites.append(entry)
         except (IOError, SuiteServiceFileError) as exc:
             LOG.error('%s: %s', reg, exc)
     return suites
Example #10
0
 def _upgrade_with_state_file_header(self, line):
     """Parse a header line in state file, add information to DB."""
     head, tail = line.split(" : ", 1)
     if head == "time":
         self.add_insert_item(
             self.TABLE_CHECKPOINT_ID, {
                 "id": self.CHECKPOINT_LATEST_ID,
                 "time": tail.split(" ", 1)[0],
                 "event": self.CHECKPOINT_LATEST_EVENT
             })
         return
     for name, key in [("run mode", "run_mode"),
                       ("initial cycle", "initial_point"),
                       ("final cycle", "final_point")]:
         if tail == "None":
             tail = None
         if head == name:
             self.add_insert_item(self.TABLE_SUITE_PARAMS, {
                 "key": key,
                 "value": tail
             })
             LOG.info(" + %s=%s", key, tail)
             if name == "final cycle":
                 return "broadcast"
             else:
                 return
Example #11
0
 def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
     """Helper for _kill_task_jobs_callback, on one task job.

     line is split on "|" into a timestamp, an ignored field and a
     return code. A line with too few fields, or with a return code
     that is not an integer, is treated as a failed kill (return code
     1) instead of raising.

     The outcome is written to the task job activity log. On a failed
     kill the task is flagged with kill_failed; on a successful kill of
     a submitted or running task the matching failure message is passed
     to the task events manager; in any other task state the result is
     ignored. The outcome is also set as the task summary message and
     logged.
     """
     ctx = SubProcContext(self.JOBS_KILL, None)
     ctx.out = line
     try:
         timestamp, _, ret_code = line.split("|", 2)
         # Parse inside the "try" so that a malformed return code is
         # handled like any other malformed line.
         ret_code = int(ret_code)
     except ValueError:
         ctx.ret_code = 1
     else:
         ctx.timestamp = timestamp
         ctx.ret_code = ret_code
     if ctx.ret_code:
         ctx.cmd = cmd_ctx.cmd  # print original command on failure
     log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
     log_lvl = INFO
     log_msg = 'killed'
     if ctx.ret_code:  # non-zero exit status
         log_lvl = WARNING
         log_msg = 'kill failed'
         itask.state.kill_failed = True
     elif itask.state.status == TASK_STATUS_SUBMITTED:
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
             ctx.timestamp)
     elif itask.state.status == TASK_STATUS_RUNNING:
         self.task_events_mgr.process_message(
             itask, CRITICAL, TASK_OUTPUT_FAILED)
     else:
         log_lvl = DEBUG
         log_msg = (
             'ignoring job kill result, unexpected task state: %s' %
             itask.state.status)
     itask.set_summary_message(log_msg)
     # Pass the values as logging arguments rather than pre-formatting.
     LOG.log(
         log_lvl, "[%s] -job(%02d) %s",
         itask.identity, itask.submit_num, log_msg)
Example #12
0
 def _job_cmd_out_callback(suite, itask, cmd_ctx, line):
     """Callback on job command STDOUT/STDERR.

     Append line to the task job activity log, prefixed with the user
     and/or host given in cmd_ctx.cmd_kwargs (if any). A line made of
     exactly three "|"-separated fields is rewritten as "first third".
     If the log cannot be written, warnings are logged instead.
     """
     kwargs = cmd_ctx.cmd_kwargs
     host, user = kwargs.get("host"), kwargs.get("user")
     if host and user:
         prefix = "(%(user)s@%(host)s) " % kwargs
     elif host:
         prefix = "(%(host)s) " % kwargs
     elif user:
         prefix = "(%(user)s@localhost) " % kwargs
     else:
         prefix = ""
     fields = line.split("|")
     if len(fields) == 3:
         line = "%s %s" % (fields[0], fields[2])
     job_activity_log = get_task_job_activity_log(
         suite, itask.point, itask.tdef.name)
     try:
         with open(job_activity_log, "ab") as handle:
             # Terminate the line only once the log is open: if open()
             # fails, the warning below shows the line unchanged.
             if not line.endswith("\n"):
                 line += "\n"
             handle.write((prefix + line).encode())
     except IOError as exc:
         LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
         LOG.warning("[%s] -%s%s", itask, prefix, line)
Example #13
0
    def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
        """Poll jobs of specified tasks.

        Tasks in the submitted, running or failed states are polled, and
        so are succeeded tasks when poll_succ is true (the default). Any
        other task is skipped with a debug message.

        If at least one task is to be polled, msg (if not None) is logged
        at info level and the JOBS_POLL command is run through
        _run_job_cmd() with _poll_task_jobs_callback() as its callback.
        """
        pollable = {
            TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED}
        if poll_succ:
            pollable.add(TASK_STATUS_SUCCEEDED)
        selected = []
        for itask in itasks:
            if itask.state.status not in pollable:
                LOG.debug("skipping %s: not pollable, "
                          "or skipping 'succeeded' tasks" % itask.identity)
                continue
            selected.append(itask)
        if not selected:
            return
        if msg is not None:
            LOG.info(msg)
        self._run_job_cmd(
            self.JOBS_POLL, suite, selected, self._poll_task_jobs_callback)
Example #14
0
 def _job_cmd_out_callback(suite, itask, cmd_ctx, line):
     """Callback on job command STDOUT/STDERR.

     Write line to the task job activity log with a "(user@host) "
     style prefix built from cmd_ctx.cmd_kwargs, where set. A line with
     exactly three "|"-separated fields has its middle field dropped.
     On an IOError the failure and the line are logged as warnings.
     """
     cmd_kwargs = cmd_ctx.cmd_kwargs
     has_host = cmd_kwargs.get("host")
     has_user = cmd_kwargs.get("user")
     if has_host and has_user:
         owner_at_host = "(%(user)s@%(host)s) " % cmd_kwargs
     elif has_host:
         owner_at_host = "(%(host)s) " % cmd_kwargs
     elif has_user:
         owner_at_host = "(%(user)s@localhost) " % cmd_kwargs
     else:
         owner_at_host = ""
     pieces = line.split("|")
     if len(pieces) == 3:
         timestamp, content = pieces[0], pieces[2]
         line = "%s %s" % (timestamp, content)
     job_activity_log = get_task_job_activity_log(
         suite, itask.point, itask.tdef.name)
     try:
         with open(job_activity_log, "ab") as handle:
             # The newline is added only after a successful open(), so a
             # failed open() reports the line as received.
             if not line.endswith("\n"):
                 line += "\n"
             handle.write((owner_at_host + line).encode())
     except IOError as exc:
         LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
         LOG.warning("[%s] -%s%s", itask, owner_at_host, line)
Example #15
0
 def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
     """Call back when log job retrieval completes.

     For each id key of the command, check that the expected job log
     files exist. If they all do, record a good attempt (return code 0)
     and delete the event timer; otherwise record the missing file
     names (return code 1) and unset the timer's waiting flag. The
     result is written to the task job activity log. A KeyError raised
     while handling an id key is logged and that id key is skipped.
     """
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             # All completed jobs are expected to have a "job.out".
             expected = [JOB_LOG_OUT]
             try:
                 # NOTE(review): this is a substring test against
                 # 'succeeded', not an equality test - kept as is.
                 want_err = key1[1] not in 'succeeded'
             except TypeError:
                 want_err = False
             if want_err:
                 expected.append(JOB_LOG_ERR)
             exists = {}
             for fname in expected:
                 exists[fname] = os.path.exists(get_task_job_log(
                     schd_ctx.suite, point, name, submit_num, fname))
             # All expected paths must exist to record a good attempt
             log_ctx = SubProcContext((key1, submit_num), None)
             missing = sorted(
                 fname for fname, found in exists.items() if not found)
             if missing:
                 log_ctx.ret_code = 1
                 log_ctx.err = "File(s) not retrieved:" + "".join(
                     " %s" % fname for fname in missing)
                 self.event_timers[id_key].unset_waiting()
             else:
                 log_ctx.ret_code = 0
                 del self.event_timers[id_key]
             log_task_job_activity(
                 log_ctx, schd_ctx.suite, point, name, submit_num)
         except KeyError as exc:
             LOG.exception(exc)
Example #16
0
    def _receiver(self, message):
        """Wrap incoming messages and dispatch them to exposed methods.

        Args:
            message (dict): message contents
        """
        # determine the server method to call
        try:
            method = getattr(self, message['command'])
            args = message['args']
            args.update({'user': message['user']})
            if 'meta' in message:
                args['meta'] = message['meta']
        except KeyError:
            # malformed message
            return {'error': {
                'message': 'Request missing required field(s).'}}
        except AttributeError:
            # no exposed method by that name
            return {'error': {
                'message': 'No method by the name "%s"' % message['command']}}

        # generate response
        try:
            response = method(**args)
        except Exception as exc:
            # includes incorrect arguments (TypeError)
            LOG.exception(exc)  # note the error server side
            import traceback
            return {'error': {
                'message': str(exc), 'traceback': traceback.format_exc()}}

        return {'data': response}
Example #17
0
 def _process_message_failed(self, itask, event_time, message):
     """Helper for process_message, handle a failed message.

     Record the finish time (the current time if event_time is None)
     and a run status of 1, then reset the task to either retrying (a
     retry timer yields a next delay) or failed. Event handlers are set
     up only when reset_state() returns a true value.
     """
     if event_time is None:
         event_time = get_current_time_string()
     itask.set_summary_time('finished', event_time)
     self.suite_db_mgr.put_update_task_jobs(
         itask, {"run_status": 1, "time_run_exit": event_time})
     timers = itask.try_timers
     # next() is only called when a retry timer exists.
     if (TASK_STATUS_RETRYING in timers and
             timers[TASK_STATUS_RETRYING].next() is not None):
         # There is a retry lined up
         retry_timer = itask.try_timers[TASK_STATUS_RETRYING]
         delay_msg = "retrying in %s" % (
             retry_timer.delay_timeout_as_str())
         msg = "failed, %s" % (delay_msg)
         LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
         itask.set_summary_message(msg)
         if itask.state.reset_state(TASK_STATUS_RETRYING):
             self.setup_event_handlers(
                 itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
     else:
         # No retry lined up: definitive failure.
         self.pflag = True
         if itask.state.reset_state(TASK_STATUS_FAILED):
             self.setup_event_handlers(itask, "failed", message)
         LOG.critical(
             "[%s] -job(%02d) %s", itask, itask.submit_num, "failed")
     self._reset_job_timers(itask)
Example #18
0
 def _set_state(self, status):
     """Set, log and record task status.

     This is for a normal change, not a forced one: the task_events
     table is not updated.

     Return None if status is already the current status and there is
     no hold swap; otherwise return the previous (status, hold swap).
     """
     if self.hold_swap == self.status:
         self.hold_swap = None
     if self.hold_swap is None and status == self.status:
         return
     before = (self.status, self.hold_swap)
     was_status, was_swap = before
     if status == TASK_STATUS_HELD:
         self.hold_swap = was_status
     elif status in TASK_STATUSES_ACTIVE:
         if was_status == TASK_STATUS_HELD:
             self.hold_swap = TASK_STATUS_HELD
     elif (was_swap == TASK_STATUS_HELD and
             status not in TASK_STATUSES_FINAL):
         # The requested status becomes the swap value; status is held.
         self.hold_swap, status = status, TASK_STATUS_HELD
     elif was_swap:
         self.hold_swap = None
     self.status = status
     self.time_updated = get_current_time_string()
     self.is_updated = True
     # Log
     old_text = str(was_status)
     if was_swap:
         old_text += " (%s)" % was_swap
     new_text = " => %s" % self.status
     if self.hold_swap:
         new_text += " (%s)" % self.hold_swap
     LOG.debug("[%s] -%s", self.identity, old_text + new_text)
     return before
Example #19
0
 def check_job_time(self, itask, now):
     """Check/handle job timeout and poll timer.

     If the task has no timeout, or it has not been passed yet, return
     the result of check_poll_time(). Otherwise clear the timeout (so
     the event is emitted only once) and, for a running or submitted
     task, log a warning, set up the event handlers for the execution
     or submission timeout and return True. For a task in any other
     state no event is emitted and the result of check_poll_time() is
     returned.
     """
     can_poll = self.check_poll_time(itask, now)
     if itask.timeout is None or now <= itask.timeout:
         return can_poll
     # Timeout reached for task, emit event and reset itask.timeout
     if itask.state.status == TASK_STATUS_RUNNING:
         time_ref = itask.summary['started_time']
         event = 'execution timeout'
     elif itask.state.status == TASK_STATUS_SUBMITTED:
         time_ref = itask.summary['submitted_time']
         event = 'submission timeout'
     else:
         # No timeout event applies in this state. Without this branch
         # the names below would be unbound.
         time_ref = None
         event = None
     msg = event
     if event is not None:
         try:
             msg += ' after %s' % intvl_as_str(itask.timeout - time_ref)
         except (TypeError, ValueError):
             # Badness in time_ref?
             pass
     itask.timeout = None  # emit event only once
     if event is None:
         return can_poll
     LOG.warning('[%s] -%s', itask, msg)
     self.setup_event_handlers(itask, event, msg)
     return True
Example #20
0
 def _process_message_submit_failed(self, itask, event_time):
     """Helper for process_message, handle a submit-failed message.

     Record the failed submission in the task jobs table, using
     event_time as the submit exit time (or the current time if
     event_time is None). Then reset the task to submit-failed if no
     submission retry timer yields a next delay, or to submit-retrying
     otherwise. Event handlers are set up only when reset_state()
     returns a true value.
     """
     LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
     if event_time is None:
         event_time = get_current_time_string()
     self.suite_db_mgr.put_update_task_jobs(
         itask, {
             # Record the time of the event, not the time it is handled.
             "time_submit_exit": event_time,
             "submit_status": 1,
         })
     itask.summary['submit_method_id'] = None
     if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
             itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
         # No submission retry lined up: definitive failure.
         self.pflag = True
         # See github #476.
         if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
             self.setup_event_handlers(itask, self.EVENT_SUBMIT_FAILED,
                                       'job %s' % self.EVENT_SUBMIT_FAILED)
     else:
         # There is a submission retry lined up.
         timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
         delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
         msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
         LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
         itask.set_summary_message(msg)
         if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
             self.setup_event_handlers(
                 itask, self.EVENT_SUBMIT_RETRY,
                 "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
     self._reset_job_timers(itask)
Example #21
0
    def _upgrade_with_state_file_extras(self):
        """Upgrade the database tables after reading in state file.

        The task states and task events tables are renamed with an "_old"
        suffix, recreated by create_tables(), filled row by row from the
        old tables (selecting the new tables' column names), and the old
        tables are then dropped. Each stage is committed.
        """
        conn = self.connect()
        t_names = [self.TABLE_TASK_STATES, self.TABLE_TASK_EVENTS]

        # Rename old tables
        for t_name in t_names:
            rename_stmt = (
                r"ALTER TABLE " + t_name + r" RENAME TO " + t_name + "_old")
            conn.execute(rename_stmt)
        conn.commit()

        # Create tables with new columns
        self.create_tables()

        # Populate new tables using old column data
        for t_name in t_names:
            LOG.info(r"Upgrading %s table", t_name)
            table = self.tables[t_name]
            select_stmt = (
                r"SELECT " + ",".join(col.name for col in table.columns) +
                " FROM " + t_name + "_old")
            # These tables can be big, so we don't want to queue the items
            # in memory.
            for row in conn.execute(select_stmt):
                conn.execute(table.get_insert_stmt(), list(row))
        conn.commit()

        # Drop old tables
        for t_name in t_names:
            conn.execute(r"DROP TABLE " + t_name + "_old")
        conn.commit()
Example #22
0
 def check_job_time(self, itask, now):
     """Check/handle job timeout and poll timer.

     Return the result of check_poll_time() if the task has no timeout
     or the timeout has not been passed. Once it has been passed, the
     timeout is cleared so that the event is emitted only once; for a
     running or submitted task a warning is logged, the handlers for
     the execution or submission timeout event are set up and True is
     returned. In any other task state no event is emitted and the
     result of check_poll_time() is returned.
     """
     can_poll = self.check_poll_time(itask, now)
     if itask.timeout is None or now <= itask.timeout:
         return can_poll
     # Timeout reached for task, emit event and reset itask.timeout
     event = None
     time_ref = None
     if itask.state.status == TASK_STATUS_RUNNING:
         time_ref = itask.summary['started_time']
         event = 'execution timeout'
     elif itask.state.status == TASK_STATUS_SUBMITTED:
         time_ref = itask.summary['submitted_time']
         event = 'submission timeout'
     if event is None:
         # Neither running nor submitted: nothing to emit. The names
         # above are pre-set so that this case cannot hit an unbound
         # local variable.
         itask.timeout = None
         return can_poll
     msg = event
     try:
         msg += ' after %s' % intvl_as_str(itask.timeout - time_ref)
     except (TypeError, ValueError):
         # Badness in time_ref?
         pass
     itask.timeout = None  # emit event only once
     LOG.warning('[%s] -%s', itask, msg)
     self.setup_event_handlers(itask, event, msg)
     return True
Example #23
0
 def list_suites(self, regfilter=None):
     """Return a filtered list of valid suite registrations.

     Walk the run directory (following links), not descending below a
     directory that has a service directory or a "cylc-suite.db" file.
     Each directory whose service directory holds the source item, and
     whose path relative to the run directory matches regfilter (if
     given), contributes a [registration, source directory, title]
     entry.

     Raise ValueError if regfilter is not a valid regular expression.
     """
     pattern = None
     if regfilter:
         try:
             pattern = re.compile(regfilter)
         except re.error as exc:
             raise ValueError("%s: %s" % (regfilter, exc))
     run_d = glbl_cfg().get_host_item('run directory')
     registrations = []
     for dirpath, dnames, fnames in os.walk(run_d, followlinks=True):
         # Always descend for top directory, but
         # don't descend further if it has a:
         # * .service/
         # * cylc-suite.db: (pre-cylc-7 suites don't have ".service/").
         is_suite_d = (
             self.DIR_BASE_SRV in dnames or "cylc-suite.db" in fnames)
         if dirpath != run_d and is_suite_d:
             del dnames[:]
         # Choose only suites with .service and matching filter
         reg = os.path.relpath(dirpath, run_d)
         srv_d = os.path.join(dirpath, self.DIR_BASE_SRV)
         if not self._locate_item(self.FILE_BASE_SOURCE, srv_d):
             continue
         if pattern and not pattern.search(reg):
             continue
         try:
             registrations.append([
                 reg,
                 self.get_suite_source_dir(reg),
                 self.get_suite_title(reg),
             ])
         except (IOError, SuiteServiceFileError) as exc:
             LOG.error('%s: %s', reg, exc)
     return registrations
Example #24
0
 def _process_message_failed(self, itask, event_time, message):
     """Helper for process_message, handle a failed message.

     The finish time (the current time if event_time is None) and a run
     status of 1 are recorded first. The task is then reset to failed
     if no retry timer yields a next delay, or to retrying otherwise;
     event handlers are set up only when reset_state() returns a true
     value.
     """
     finish_time = event_time
     if finish_time is None:
         finish_time = get_current_time_string()
     itask.set_summary_time('finished', finish_time)
     self.suite_db_mgr.put_update_task_jobs(
         itask, {"run_status": 1, "time_run_exit": finish_time})
     # next() is only called when a retry timer exists.
     no_retry = (
         TASK_STATUS_RETRYING not in itask.try_timers or
         itask.try_timers[TASK_STATUS_RETRYING].next() is None)
     if no_retry:
         # No retry lined up: definitive failure.
         self.pflag = True
         if itask.state.reset_state(TASK_STATUS_FAILED):
             self.setup_event_handlers(itask, "failed", message)
         LOG.critical(
             "[%s] -job(%02d) %s", itask, itask.submit_num, "failed")
     else:
         # There is a retry lined up
         timer = itask.try_timers[TASK_STATUS_RETRYING]
         delay_msg = "retrying in %s" % (timer.delay_timeout_as_str())
         msg = "failed, %s" % (delay_msg)
         LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
         itask.set_summary_message(msg)
         if itask.state.reset_state(TASK_STATUS_RETRYING):
             handler_msg = "%s, %s" % (self.JOB_FAILED, delay_msg)
             self.setup_event_handlers(itask, "retry", handler_msg)
     self._reset_job_timers(itask)
Example #25
0
    def _remove_bad_hosts(self, mock_host_stats=None):
        """Return dictionary of 'good' hosts with their metric stats.

        Run 'get-host-metrics' on each run host in parallel & store extracted
        stats for hosts, else an empty JSON structure. Filter out 'bad' hosts
        whereby either metric data cannot be accessed from the command or at
        least one metric value does not pass a specified threshold.
        """
        if mock_host_stats:  # Create fake data for unittest purposes (only).
            host_stats = dict(mock_host_stats)  # Prevent mutable object issues
        else:
            if not self.hosts:
                return {}
            host_stats = self._get_host_metrics()
        # Analyse get-host-metrics results
        for host, data in list(dict(host_stats).items()):
            if not data:
                # No results for host (command failed) -> skip.
                host_stats.pop(host)
                continue
            for measure, cutoff in self.parsed_thresholds.items():
                datum = data[measure]
                # Cutoff is a minimum or maximum depending on measure context.
                if ((datum > cutoff and measure.startswith("load"))
                        or (datum < cutoff and
                            (measure == "memory"
                             or measure.startswith("disk-space")))):
                    # Alert user that threshold has not been met.
                    LOG.warning(
                        "host '%s' did not pass %s threshold " +
                        "(%s %s threshold %s)\n", host, measure, datum,
                        ">" if measure.startswith("load") else "<", cutoff)
                    host_stats.pop(host)
                    break
        return host_stats
Example #26
0
 def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
     """Helper for _kill_task_jobs_callback, on one task job.

     line is split on "|" into a timestamp, an ignored field and a
     return code. If there are too few fields, or the return code is
     not an integer, the kill is treated as failed (return code 1)
     rather than raising.

     The outcome is written to the task job activity log. A failed kill
     flags the task with kill_failed. A successful kill of a submitted
     or running task passes the matching failure message to the task
     events manager. In any other task state the result is ignored. The
     outcome is also set as the task summary message and logged.
     """
     ctx = SubProcContext(self.JOBS_KILL, None)
     ctx.out = line
     try:
         timestamp, _, ret_code = line.split("|", 2)
         # int() is inside the "try" so that a malformed return code is
         # handled in the same way as a malformed line.
         ret_code = int(ret_code)
     except ValueError:
         ctx.ret_code = 1
     else:
         ctx.timestamp = timestamp
         ctx.ret_code = ret_code
     if ctx.ret_code:
         ctx.cmd = cmd_ctx.cmd  # print original command on failure
     log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
     log_lvl = INFO
     log_msg = 'killed'
     if ctx.ret_code:  # non-zero exit status
         log_lvl = WARNING
         log_msg = 'kill failed'
         itask.state.kill_failed = True
     elif itask.state.status == TASK_STATUS_SUBMITTED:
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
             ctx.timestamp)
     elif itask.state.status == TASK_STATUS_RUNNING:
         self.task_events_mgr.process_message(
             itask, CRITICAL, TASK_OUTPUT_FAILED)
     else:
         log_lvl = DEBUG
         log_msg = (
             'ignoring job kill result, unexpected task state: %s' %
             itask.state.status)
     itask.set_summary_message(log_msg)
     # Values go in as logging arguments instead of being pre-formatted.
     LOG.log(
         log_lvl, "[%s] -job(%02d) %s",
         itask.identity, itask.submit_num, log_msg)
Example #27
0
    def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
        """Poll jobs of specified tasks.

        A task is polled if its status is submitted, running or failed,
        or succeeded when poll_succ is true (which is the default). Tasks
        in any other state are skipped, with a debug message.

        When there are tasks to poll, msg is logged at info level (unless
        it is None) and _run_job_cmd() is called for the JOBS_POLL command
        with _poll_task_jobs_callback() as the callback.
        """
        statuses = [
            TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED]
        if poll_succ:
            statuses.append(TASK_STATUS_SUCCEEDED)
        pollable_statuses = set(statuses)
        to_poll_tasks = []
        for itask in itasks:
            if itask.state.status in pollable_statuses:
                to_poll_tasks.append(itask)
                continue
            LOG.debug("skipping %s: not pollable, "
                      "or skipping 'succeeded' tasks" % itask.identity)
        if not to_poll_tasks:
            return
        if msg is not None:
            LOG.info(msg)
        self._run_job_cmd(
            self.JOBS_POLL, suite, to_poll_tasks,
            self._poll_task_jobs_callback)
Example #28
0
 def _process_message_succeeded(self, itask, event_time):
     """Helper for process_message, handle a succeeded message.

     Record the finish time in the task summary and the database, append
     the elapsed run time to the task definition when a start time is
     known, report any outputs left incomplete, then reset the task state
     to succeeded and reset its job timers.
     """
     self.pflag = True
     itask.set_summary_time('finished', event_time)
     db_update = {
         "run_status": 0,
         "time_run_exit": event_time,
     }
     self.suite_db_mgr.put_update_task_jobs(itask, db_update)
     # Update mean elapsed time only on task succeeded.
     if itask.summary['started_time'] is not None:
         elapsed = (
             itask.summary['finished_time'] - itask.summary['started_time'])
         itask.tdef.elapsed_times.append(elapsed)
     if not itask.state.outputs.all_completed():
         # These outputs are not expected to complete on success.
         not_reported = [
             TASK_OUTPUT_EXPIRED, TASK_OUTPUT_SUBMIT_FAILED,
             TASK_OUTPUT_FAILED
         ]
         msg = "".join(
             "\n  " + output
             for output in itask.state.outputs.get_not_completed()
             if output not in not_reported)
         if msg:
             LOG.info("[%s] -Succeeded with outputs not completed: %s",
                      itask, msg)
     if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
         self.setup_event_handlers(itask, "succeeded", "job succeeded")
     self._reset_job_timers(itask)
Example #29
0
    def remote_tidy(self):
        """Remove suite contact files from initialised remotes.

        Call "cylc remote-tidy".
        This method is called on suite shutdown, so we want nothing to hang.
        Timeout any incomplete commands after 10 seconds.

        Also remove UUID file on suite host ".service/uuid".
        """
        def report_if_failed(host, owner, cmd, proc):
            """Collect output of an exited process, warn on non-zero exit."""
            # Pipes are opened in binary mode: decode before reporting.
            out, err = (data.decode() for data in proc.communicate())
            if proc.wait():
                LOG.warning(TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY,
                    (host, owner), ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))

        # Remove UUID file
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
            FILE_BASE_UUID)
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass
        # Issue all SSH commands in parallel
        procs = {}
        # A single null-device handle is shared as stdin by every child and
        # closed once all of them have been spawned.
        with open(os.devnull) as devnull:
            for (host, owner), init_state in self.remote_init_map.items():
                if init_state != REMOTE_INIT_DONE:
                    continue
                cmd = ['timeout', '10', 'cylc', 'remote-tidy']
                if is_remote_host(host):
                    cmd.append('--host=%s' % host)
                if is_remote_user(owner):
                    cmd.append('--user=%s' % owner)
                if cylc.flags.debug:
                    cmd.append('--debug')
                cmd.append(os.path.join(glbl_cfg().get_derived_host_item(
                    self.suite, 'suite run directory', host, owner)))
                procs[(host, owner)] = (
                    cmd,
                    Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=devnull))
        # Wait for commands to complete for a max of 10 seconds
        timeout = time() + 10.0
        while procs and time() < timeout:
            for (host, owner), (cmd, proc) in procs.copy().items():
                if proc.poll() is None:
                    continue
                del procs[(host, owner)]
                report_if_failed(host, owner, cmd, proc)
        # Terminate any remaining commands
        for (host, owner), (cmd, proc) in procs.items():
            try:
                proc.terminate()
            except OSError:
                pass
            report_if_failed(host, owner, cmd, proc)
Example #30
0
    def remote_tidy(self):
        """Remove suite contact files from initialised remotes.

        Call "cylc remote-tidy".
        This method is called on suite shutdown, so we want nothing to hang.
        Timeout any incomplete commands after 10 seconds.

        Also remove UUID file on suite host ".service/uuid".
        """
        def warn_on_failure(auth, cmd, proc):
            """Gather output of an exited process, warn on non-zero exit."""
            # stdout/stderr are binary pipes, so decode them for the report.
            out, err = (data.decode() for data in proc.communicate())
            if proc.wait():
                LOG.warning(TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY,
                    auth, ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))

        # Remove UUID file
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
            FILE_BASE_UUID)
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass
        # Issue all SSH commands in parallel
        procs = {}
        # Open the null device once for all children instead of leaking one
        # handle per remote; it is closed when the spawning loop is done.
        with open(os.devnull) as devnull:
            for (host, owner), init_state in self.remote_init_map.items():
                if init_state != REMOTE_INIT_DONE:
                    continue
                cmd = ['timeout', '10', 'cylc', 'remote-tidy']
                if is_remote_host(host):
                    cmd.append('--host=%s' % host)
                if is_remote_user(owner):
                    cmd.append('--user=%s' % owner)
                if cylc.flags.debug:
                    cmd.append('--debug')
                cmd.append(os.path.join(glbl_cfg().get_derived_host_item(
                    self.suite, 'suite run directory', host, owner)))
                procs[(host, owner)] = (
                    cmd,
                    Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=devnull))
        # Wait for commands to complete for a max of 10 seconds
        timeout = time() + 10.0
        while procs and time() < timeout:
            for (host, owner), (cmd, proc) in procs.copy().items():
                if proc.poll() is None:
                    continue
                del procs[(host, owner)]
                warn_on_failure((host, owner), cmd, proc)
        # Terminate any remaining commands
        for (host, owner), (cmd, proc) in procs.items():
            try:
                proc.terminate()
            except OSError:
                pass
            warn_on_failure((host, owner), cmd, proc)
Example #31
0
 def create_directory(dir_, name):
     """Make sure directory ``dir_`` exists, creating parents as needed.

     An OSError from the creation is logged and replaced by a
     GlobalConfigError whose message quotes ``name``.
     """
     try:
         os.makedirs(dir_, exist_ok=True)
     except OSError as exc:
         LOG.exception(exc)
         message = 'Failed to create directory "' + name + '"'
         raise GlobalConfigError(message)
Example #32
0
 def create_directory(dir_, name):
     """Create ``dir_`` (and missing parents); existing directory is fine.

     On OSError, log the exception and raise GlobalConfigError instead,
     mentioning ``name`` in the message.
     """
     try:
         os.makedirs(dir_, exist_ok=True)
     except OSError as exc:
         LOG.exception(exc)
         quoted_name = '"' + name + '"'
         raise GlobalConfigError('Failed to create directory ' + quoted_name)
Example #33
0
 def create_directory(dir_, name):
     """Create directory ``dir_`` via ``mkdir_p``.

     On OSError, log the exception and raise GlobalConfigError instead,
     mentioning ``name`` in the message.
     """
     try:
         mkdir_p(dir_)
     except OSError as exc:
         LOG.exception(exc)
         message = 'Failed to create directory "' + name + '"'
         raise GlobalConfigError(message)
Example #34
0
    def clear_broadcast(self,
                        point_strings=None,
                        namespaces=None,
                        cancel_settings=None):
        """Clear broadcasts globally, or for listed namespaces and/or points.

        Return a tuple (modified_settings, bad_options), where:
        * modified_settings is a list of (point_string, namespace, setting)
          tuples, one per removed broadcast; setting is the removed value
          nested under its original keys.
        * bad_options is a dict in the form:
              {"point_strings": ["20020202", ..."], ...}
          The dict is only populated if there are options not associated with
          previous broadcasts. The keys can be:
          * point_strings: a list of bad point strings.
          * namespaces: a list of bad namespaces.
          * cancel: a list of tuples. Each tuple contains the keys of a bad
            setting.
        """
        # If cancel_settings defined, only clear specific broadcasts
        cancel_keys_list = self._settings_to_keys_list(cancel_settings)

        # Clear broadcasts
        modified_settings = []
        with self.lock:
            for point_string, point_settings in self.broadcasts.items():
                if point_strings and point_string not in point_strings:
                    continue
                for namespace, namespace_settings in point_settings.items():
                    if namespaces and namespace not in namespaces:
                        continue
                    # Depth-first walk over the nested settings dicts; each
                    # entry is (keys leading to the dict, the dict itself).
                    pending = [([], namespace_settings)]
                    while pending:
                        parent_keys, branch = pending.pop()
                        for key, value in branch.items():
                            key_path = parent_keys + [key]
                            if isinstance(value, dict):
                                pending.append((key_path, value))
                                continue
                            if (cancel_keys_list
                                    and key_path not in cancel_keys_list):
                                continue
                            # Blank the leaf; rebuild the nesting around the
                            # removed value for the report.
                            branch[key] = None
                            setting = {key: value}
                            for rkey in reversed(parent_keys):
                                setting = {rkey: setting}
                            modified_settings.append(
                                (point_string, namespace, setting))

        # Prune any empty branches
        pruned = self._prune()
        bad_options = self._get_bad_options(pruned, point_strings,
                                            namespaces, cancel_keys_list)

        # Log the broadcast
        self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True)
        LOG.info(get_broadcast_change_report(modified_settings,
                                             is_cancel=True))
        if bad_options:
            LOG.error(get_broadcast_bad_options_report(bad_options))

        return (modified_settings, bad_options)
Example #35
0
 def upgrade_from_611(self):
     """Upgrade database on restart with a 6.11.X private database.

     Add a TEXT column "hold_swap" to the task pool table and to the task
     pool checkpoints table, then commit.
     """
     conn = self.connect()
     tables_to_alter = (
         self.TABLE_TASK_POOL, self.TABLE_TASK_POOL_CHECKPOINTS)
     for t_name in tables_to_alter:
         LOG.info("Add hold_swap column to %s", t_name)
         stmt = r"ALTER TABLE " + t_name + r" ADD COLUMN hold_swap TEXT"
         conn.execute(stmt)
     conn.commit()
Example #36
0
 def recover_pub_from_pri(self):
     """Recover public database from private database.

     Nothing is done until the public DAO has reached its maximum number
     of tries; then the private database is copied over the public one,
     a warning is logged and the try counter is reset.
     """
     pub_dao = self.pub_dao
     if pub_dao.n_tries < pub_dao.MAX_TRIES:
         return
     self.copy_pri_to_pub()
     names = {
         "pub_db_name": self.pub_dao.db_file_name,
         "pri_db_name": self.pri_dao.db_file_name}
     LOG.warning(
         "%(pub_db_name)s: recovered from %(pri_db_name)s" % names)
     self.pub_dao.n_tries = 0
Example #37
0
    def clear_broadcast(
            self, point_strings=None, namespaces=None, cancel_settings=None):
        """Clear broadcasts globally, or for listed namespaces and/or points.

        Return a tuple (modified_settings, bad_options), where:
        * modified_settings lists one (point_string, namespace, setting)
          tuple per removed broadcast, with the removed value nested under
          its original keys in setting.
        * bad_options is a dict in the form:
              {"point_strings": ["20020202", ..."], ...}
          The dict is only populated if there are options not associated with
          previous broadcasts. The keys can be:
          * point_strings: a list of bad point strings.
          * namespaces: a list of bad namespaces.
          * cancel: a list of tuples. Each tuple contains the keys of a bad
            setting.
        """
        # If cancel_settings defined, only clear specific broadcasts
        cancel_keys_list = self._settings_to_keys_list(cancel_settings)

        # Clear broadcasts
        modified_settings = []
        with self.lock:
            for point_string, by_namespace in self.broadcasts.items():
                if point_strings and point_string not in point_strings:
                    continue
                for namespace, namespace_settings in by_namespace.items():
                    if namespaces and namespace not in namespaces:
                        continue
                    # Stack of (key path, nested dict) still to be visited.
                    todo = [([], namespace_settings)]
                    while todo:
                        prefix, node = todo.pop()
                        for key, value in node.items():
                            full_keys = prefix + [key]
                            if isinstance(value, dict):
                                todo.append((full_keys, value))
                                continue
                            selected = (
                                not cancel_keys_list or
                                full_keys in cancel_keys_list)
                            if not selected:
                                continue
                            node[key] = None
                            # Wrap the removed value back in its parent keys.
                            setting = {key: value}
                            for rkey in reversed(prefix):
                                setting = {rkey: setting}
                            modified_settings.append(
                                (point_string, namespace, setting))

        # Prune any empty branches
        pruned = self._prune()
        bad_options = self._get_bad_options(
            pruned, point_strings, namespaces, cancel_keys_list)

        # Log the broadcast
        self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True)
        change_report = get_broadcast_change_report(
            modified_settings, is_cancel=True)
        LOG.info(change_report)
        if bad_options:
            LOG.error(get_broadcast_bad_options_report(bad_options))

        return (modified_settings, bad_options)
Example #38
0
 def execute_queued_items(self):
     """Execute queued items for each table.

     For every table run its queued DELETE, INSERT and UPDATE statements,
     then commit. On success the queues are emptied and the retry counter
     is reset. On sqlite3.Error the error is re-raised unless this is the
     public database, in which case the attempt is counted, a warning is
     logged and the transaction is rolled back with the queues left intact.
     The connection is closed in every case.
     """
     try:
         for table in self.tables.values():
             # DELETE statements may have varying number of WHERE args so we
             # can only executemany for each identical template statement.
             for del_stmt, del_args_list in table.delete_queues.items():
                 self._execute_stmt(del_stmt, del_args_list)
             # INSERT statements are uniform for each table, so all INSERT
             # statements can be executed using a single "executemany" call.
             if table.insert_queue:
                 insert_stmt = table.get_insert_stmt()
                 self._execute_stmt(insert_stmt, table.insert_queue)
             # UPDATE statements can have varying number of SET and WHERE
             # args so we can only executemany for each identical template
             # statement.
             for upd_stmt, upd_args_list in table.update_queues.items():
                 self._execute_stmt(upd_stmt, upd_args_list)
         # Connection should only be opened if we have executed something.
         if self.conn is None:
             return
         self.conn.commit()
     except sqlite3.Error:
         if not self.is_public:
             raise
         self.n_tries += 1
         report = {
             "file": self.db_file_name,
             "attempt": self.n_tries
         }
         LOG.warning(
             "%(file)s: write attempt (%(attempt)d) did not complete\n" %
             report)
         if self.conn is not None:
             try:
                 self.conn.rollback()
             except sqlite3.Error:
                 pass
         return
     else:
         # Clear the queues
         for table in self.tables.values():
             table.delete_queues.clear()
             del table.insert_queue[:]  # list.clear avail from Python 3.3
             table.update_queues.clear()
         # Report public database retry recovery if necessary
         if self.n_tries:
             report = {
                 "file": self.db_file_name,
                 "attempt": self.n_tries
             }
             LOG.warning(
                 "%(file)s: recovered after (%(attempt)d) attempt(s)\n" %
                 report)
         self.n_tries = 0
     finally:
         # Note: This is not strictly necessary. However, if the suite run
         # directory is removed, a forced reconnection to the private
         # database will ensure that the suite dies.
         self.close()
Example #39
0
 def satisfy_xclock(self, itask):
     """Attempt to satisfy itask's clock trigger, if it has one.

     Does nothing when the trigger is already satisfied or when
     ``wall_clock`` does not return a true value for the trigger's
     arguments. Otherwise mark the trigger satisfied on the task, record
     its signature and log it.
     """
     label, sig, ctx, satisfied = self._get_xclock(itask)
     if satisfied or not wall_clock(*ctx.func_args, **ctx.func_kwargs):
         return
     itask.state.xclock = (label, True)
     self.sat_xclock.append(sig)
     LOG.info('clock xtrigger satisfied: %s = %s' % (label, str(ctx)))
Example #40
0
 def _forget_client(self, uuid):
     """Forget a client.

     Return False if ``uuid`` is not a known client. Otherwise drop its
     entry, detach its error log handler from LOG (if it has one), log the
     event at DEBUG level and return True.
     """
     try:
         client_info = self.clients.pop(uuid)
     except KeyError:
         return False
     err_log_handler = client_info.get('err_log_handler')
     if err_log_handler is not None:
         LOG.removeHandler(err_log_handler)
     LOG.debug(self.LOG_FORGET_TMPL, uuid)
     return True
Example #41
0
 def satisfy_xclock(self, itask):
     """Attempt to satisfy itask's clock trigger, if it has one.

     Return early if the trigger is already satisfied. Otherwise call
     ``wall_clock`` with the trigger's arguments; if it returns a true
     value, flag the trigger as satisfied on the task, remember its
     signature and log it.
     """
     label, sig, ctx, satisfied = self._get_xclock(itask)
     if satisfied:
         return
     clock_reached = wall_clock(*ctx.func_args, **ctx.func_kwargs)
     if not clock_reached:
         return
     itask.state.xclock = (label, True)
     self.sat_xclock.append(sig)
     LOG.info('clock xtrigger satisfied: %s = %s' % (label, str(ctx)))
Example #42
0
 def _run_event_custom_handlers(self, config, ctx):
     """Helper for "run_event_handlers", custom event handlers.

     Use the handlers configured for the specific event; failing that, the
     general handlers if the event is listed under "handler events". Each
     handler is a command template filled with quoted event data. A
     handler left unchanged by substitution is called with the classic
     positional arguments (event, suite, reason).

     Raise SuiteEventError on a bad template when the "abort if ... handler
     fails" setting for the event is set.
     """
     # Look for event handlers
     # 1. Handlers for specific event
     # 2. General handlers
     handlers = self.get_events_conf(config, '%s handler' % ctx.event)
     if not handlers:
         handler_events = self.get_events_conf(
             config, 'handler events', [])
         if ctx.event in handler_events:
             handlers = self.get_events_conf(config, 'handlers')
     if not handlers:
         return
     for index, handler in enumerate(handlers):
         cmd_key = (
             '%s-%02d' % (self.SUITE_EVENT_HANDLER, index), ctx.event)
         # Handler command may be a string for substitution
         abort_on_error = self.get_events_conf(
             config, 'abort if %s handler fails' % ctx.event)
         try:
             handler_data = {
                 'event': quote(ctx.event),
                 'message': quote(ctx.reason),
                 'suite': quote(ctx.suite),
                 'suite_uuid': quote(str(ctx.uuid_str)),
             }
             if config.cfg['meta']:
                 for key, value in config.cfg['meta'].items():
                     quoted_value = quote(value)
                     if key == "URL":
                         handler_data["suite_url"] = quoted_value
                     handler_data[key] = quoted_value
             cmd = handler % (handler_data)
         except KeyError as exc:
             message = "%s bad template: %s" % (cmd_key, exc)
             LOG.error(message)
             if abort_on_error:
                 raise SuiteEventError(message)
             continue
         if cmd == handler:
             # Nothing substituted, assume classic interface
             classic_args = (handler, ctx.event, ctx.suite, ctx.reason)
             cmd = "%s '%s' '%s' '%s'" % classic_args
         proc_ctx = SubProcContext(
             cmd_key, cmd, env=dict(os.environ), shell=True)
         run_in_foreground = abort_on_error or self.proc_pool.closed
         if not run_in_foreground:
             # Run command using process pool otherwise
             self.proc_pool.put_command(
                 proc_ctx, self._run_event_handlers_callback)
             continue
         # Run command in foreground if abort on failure is set or if
         # process pool is closed
         self.proc_pool.run_command(proc_ctx)
         self._run_event_handlers_callback(
             proc_ctx, abort_on_error=abort_on_error)
Example #43
0
    def restart_upgrade(self):
        """Vacuum/upgrade runtime DB on restart.

        Three cases, depending on which files exist in the suite run
        directory and only if the current private database is absent:
        * pre-6.11.X database + state file: copy the old database, upgrade
          it with the state file, then archive and remove "state/".
        * 6.11.X database: move it into place and upgrade it.
        * otherwise: upgrade the existing private database in place
          (pickle to JSON).

        The private database is vacuumed and closed afterwards.
        """
        # Backward compat, upgrade database with state file if necessary
        suite_run_d = os.path.dirname(os.path.dirname(self.pub_path))
        old_pri_db_path = os.path.join(suite_run_d, 'state',
                                       CylcSuiteDAO.OLD_DB_FILE_BASE_NAME)
        old_pri_db_path_611 = os.path.join(
            suite_run_d, CylcSuiteDAO.OLD_DB_FILE_BASE_NAME_611[0])
        old_state_file_path = os.path.join(suite_run_d, "state", "state")
        if (os.path.exists(old_pri_db_path)
                and os.path.exists(old_state_file_path)
                and not os.path.exists(self.pri_path)):
            # Upgrade pre-6.11.X runtime database + state file
            copy(old_pri_db_path, self.pri_path)
            pri_dao = self.get_pri_dao()
            pri_dao.upgrade_with_state_file(old_state_file_path)
            target = os.path.join(suite_run_d, "state.tar.gz")
            cmd = ["tar", "-C", suite_run_d, "-czf", target, "state"]
            # Close the null-device handle as soon as tar has finished
            # instead of leaking it.
            with open(os.devnull) as devnull:
                tar_ret_code = call(cmd, stdin=devnull)
            if tar_ret_code == 0:
                rmtree(os.path.join(suite_run_d, "state"), ignore_errors=True)
            else:
                # Do not leave a partial archive behind.
                try:
                    os.unlink(os.path.join(suite_run_d, "state.tar.gz"))
                except OSError:
                    pass
                LOG.error("cannot tar-gzip + remove old state/ directory")
            # Remove old files as well
            try:
                os.unlink(os.path.join(suite_run_d, "cylc-suite-env"))
            except OSError:
                pass
        elif (os.path.exists(old_pri_db_path_611)
              and not os.path.exists(self.pri_path)):
            # Upgrade 6.11.X runtime database
            os.rename(old_pri_db_path_611, self.pri_path)
            pri_dao = self.get_pri_dao()
            pri_dao.upgrade_from_611()
            # Remove old files as well
            for name in [
                    CylcSuiteDAO.OLD_DB_FILE_BASE_NAME_611[1], "cylc-suite-env"
            ]:
                try:
                    os.unlink(os.path.join(suite_run_d, name))
                except OSError:
                    pass
        else:
            pri_dao = self.get_pri_dao()
            pri_dao.upgrade_pickle_to_json()

        # Vacuum the primary/private database file
        pri_dao.vacuum()
        pri_dao.close()
Example #44
0
 def execute_queued_items(self):
     """Execute queued items for each table.

     Run the queued DELETE, INSERT and UPDATE statements of every table
     and commit them. After a successful commit the queues are cleared and
     the retry counter is zeroed (with a warning if earlier attempts had
     failed). A sqlite3.Error is re-raised for a non-public database; for
     the public one it is counted, logged and rolled back, and the queues
     are kept. The connection is always closed on the way out.
     """
     try:
         for table in self.tables.values():
             # DELETE statements may have varying number of WHERE args so we
             # can only executemany for each identical template statement.
             for delete_stmt, delete_args in table.delete_queues.items():
                 self._execute_stmt(delete_stmt, delete_args)
             # INSERT statements are uniform for each table, so all INSERT
             # statements can be executed using a single "executemany" call.
             if table.insert_queue:
                 insert_stmt = table.get_insert_stmt()
                 self._execute_stmt(insert_stmt, table.insert_queue)
             # UPDATE statements can have varying number of SET and WHERE
             # args so we can only executemany for each identical template
             # statement.
             for update_stmt, update_args in table.update_queues.items():
                 self._execute_stmt(update_stmt, update_args)
         # Connection should only be opened if we have executed something.
         if self.conn is None:
             return
         self.conn.commit()
     except sqlite3.Error:
         if not self.is_public:
             raise
         self.n_tries += 1
         failure_info = {
             "file": self.db_file_name, "attempt": self.n_tries}
         LOG.warning(
             "%(file)s: write attempt (%(attempt)d) did not complete\n" %
             failure_info)
         if self.conn is not None:
             try:
                 self.conn.rollback()
             except sqlite3.Error:
                 pass
         return
     else:
         # Clear the queues
         for table in self.tables.values():
             table.delete_queues.clear()
             del table.insert_queue[:]  # list.clear avail from Python 3.3
             table.update_queues.clear()
         # Report public database retry recovery if necessary
         if self.n_tries:
             recovery_info = {
                 "file": self.db_file_name, "attempt": self.n_tries}
             LOG.warning(
                 "%(file)s: recovered after (%(attempt)d) attempt(s)\n" %
                 recovery_info)
         self.n_tries = 0
     finally:
         # Note: This is not strictly necessary. However, if the suite run
         # directory is removed, a forced reconnection to the private
         # database will ensure that the suite dies.
         self.close()
Example #45
0
 def _remote_host_select_callback(self, proc_ctx, cmd_str):
     """Callback when host select command exits.

     Store the outcome for ``cmd_str`` in ``self.remote_host_str_map``:
     the first line of the command output on success (zero exit status
     and non-empty output), a TaskRemoteMgmtError otherwise.
     """
     self.ready = True
     if proc_ctx.ret_code == 0 and proc_ctx.out:
         # Good status
         LOG.debug(proc_ctx)
         outcome = proc_ctx.out.splitlines()[0]
     else:
         # Bad status
         LOG.error(proc_ctx)
         outcome = TaskRemoteMgmtError(
             TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
             proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)
     self.remote_host_str_map[cmd_str] = outcome
Example #46
0
 def _remote_host_select_callback(self, proc_ctx, cmd_str):
     """Callback when host select command exits.

     On a zero exit status with non-empty output, map ``cmd_str`` to the
     first output line. In any other case log the context as an error and
     map ``cmd_str`` to a TaskRemoteMgmtError describing the failure.
     """
     self.ready = True
     failed = proc_ctx.ret_code != 0 or not proc_ctx.out
     if failed:
         # Bad status
         LOG.error(proc_ctx)
         self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
             TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
             proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)
         return
     # Good status
     LOG.debug(proc_ctx)
     first_line = proc_ctx.out.splitlines()[0]
     self.remote_host_str_map[cmd_str] = first_line
Example #47
0
    def test_value_error_raises_system_exit(self, mocked_glbl_cfg):
        """Test that a ValueError when writing to a log stream won't result
        in multiple exceptions (which could lead to an infinite loop on some
        occasions). Instead, it **must** raise a SystemExit."""
        with tempfile.NamedTemporaryFile() as tf:
            # mock objects used when creating the file handler
            mocked = mock.MagicMock()
            mocked_glbl_cfg.return_value = mocked
            mocked.get_derived_host_item.return_value = tf.name
            mocked.get.return_value = 100
            file_handler = TimestampRotatingFileHandler("suiteA", False)
            # next line is important as pytest can have a "Bad file descriptor"
            # due to a FileHandler with default "a" (pytest tries to r/w).
            file_handler.mode = "a+"

            # enable the logger
            LOG.setLevel(logging.INFO)
            LOG.addHandler(file_handler)

            # Disable raising uncaught exceptions in logging, due to file
            # handler using stdin.fileno. See the following links for more.
            # https://github.com/pytest-dev/pytest/issues/2276 &
            # https://github.com/pytest-dev/pytest/issues/1585
            logging.raiseExceptions = False

            # first message will initialize the stream and the handler
            LOG.info("What could go")

            # here we change the stream of the handler
            old_stream = file_handler.stream
            file_handler.stream = mock.MagicMock()
            file_handler.stream.seek = mock.MagicMock()
            # make any seek on the replaced stream fail with a ValueError
            file_handler.stream.seek.side_effect = ValueError

            try:
                # next call will call the emit method and use the mocked stream
                LOG.info("wrong?!")
                self.fail("Exception SystemExit was not raised")
            except SystemExit:
                pass
            finally:
                # clean up: restore the real stream before closing the
                # handler, and re-enable logging's exception reporting
                file_handler.stream = old_stream
                file_handler.close()
                LOG.removeHandler(file_handler)
                logging.raiseExceptions = True
Example #48
0
        def _authorise(self, *args, user='******', meta=None, **kwargs):
            """Check the caller's privilege level, then call the wrapped
            function.

            ``req_priv_level`` and ``fcn`` are taken from the enclosing
            scope. ``meta`` may carry the client 'host' and 'prog', which
            are only used in log messages ('?' when missing).

            Raise Exception('Authorisation failure') if the privilege level
            of ``user`` is below ``req_priv_level``; otherwise return the
            result of ``fcn(self, *args, **kwargs)``.
            """
            if not meta:
                meta = {}
            host = meta.get('host', '?')
            prog = meta.get('prog', '?')

            usr_priv_level = self._get_priv_level(user)
            if usr_priv_level < req_priv_level:
                # Logger.warn is a deprecated alias; use Logger.warning.
                LOG.warning(
                    "[client-connect] DENIED (privilege '%s' < '%s') %s@%s:%s",
                    usr_priv_level, req_priv_level, user, host, prog)
                raise Exception('Authorisation failure')
            LOG.info(
                '[client-command] %s %s@%s:%s', fcn.__name__, user, host, prog)
            return fcn(self, *args, **kwargs)
Example #49
0
 def _event_email_callback(self, proc_ctx, schd_ctx):
     """Call back when email notification command exits.

     For each id key attached to the command: on a zero exit status,
     remove its event timer and record the success in the job activity
     log; otherwise call ``unset_waiting()`` on its timer. A KeyError
     (e.g. a timer that no longer exists) is logged, not raised.
     """
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             if proc_ctx.ret_code != 0:
                 self.event_timers[id_key].unset_waiting()
                 continue
             del self.event_timers[id_key]
             log_ctx = SubProcContext((key1, submit_num), None)
             log_ctx.ret_code = 0
             log_task_job_activity(
                 log_ctx, schd_ctx.suite, point, name, submit_num)
         except KeyError as exc:
             LOG.exception(exc)
Example #50
0
    def put_broadcast(
            self, point_strings=None, namespaces=None, settings=None):
        """Add new broadcast settings (server side interface).

        Each setting is applied to every combination of the given points
        and namespaces. A point string that cannot be standardised is
        reported as bad unless it is '*'; a namespace not found in
        ``self.linearized_ancestors`` is reported as bad.

        Return a tuple (modified_settings, bad_options) where:
          modified_settings is list of modified settings in the form:
            [("20200202", "foo", {"script": "true"}, ...]
          bad_options is a dict that may hold the keys "point_strings" and
            "namespaces", each with the list of rejected values.
        """
        modified_settings = []
        bad_point_strings = []
        bad_namespaces = []

        with self.lock:
            for setting in settings:
                for raw_point in point_strings:
                    # Standardise the point and check its validity.
                    point_string = raw_point
                    bad_point = False
                    try:
                        point_string = standardise_point_string(raw_point)
                    except PointParsingError:
                        if raw_point != '*':
                            bad_point_strings.append(raw_point)
                            bad_point = True
                    if not bad_point and point_string not in self.broadcasts:
                        self.broadcasts[point_string] = {}
                    for namespace in namespaces:
                        if namespace not in self.linearized_ancestors:
                            bad_namespaces.append(namespace)
                            continue
                        if bad_point:
                            continue
                        point_broadcasts = self.broadcasts[point_string]
                        if namespace not in point_broadcasts:
                            point_broadcasts[namespace] = {}
                        self._addict(point_broadcasts[namespace], setting)
                        modified_settings.append(
                            (point_string, namespace, setting))

        # Log the broadcast
        self.suite_db_mgr.put_broadcast(modified_settings)
        LOG.info(get_broadcast_change_report(modified_settings))

        bad_options = {}
        for option, bad_values in (
                ("point_strings", bad_point_strings),
                ("namespaces", bad_namespaces)):
            if bad_values:
                bad_options[option] = bad_values
        return modified_settings, bad_options
Example #51
0
    def _run_job_cmd(self, cmd_key, suite, itasks, callback):
        """Run job commands, e.g. poll, kill, etc.

        Group itasks with their user@host.
        Put a job command for each user@host to the multiprocess pool.

        """
        if not itasks:
            return
        auth_itasks = {}
        for itask in itasks:
            if (itask.task_host, itask.task_owner) not in auth_itasks:
                auth_itasks[(itask.task_host, itask.task_owner)] = []
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        for (host, owner), itasks in sorted(auth_itasks.items()):
            cmd = ["cylc", cmd_key]
            if LOG.isEnabledFor(DEBUG):
                cmd.append("--debug")
            if is_remote_host(host):
                cmd.append("--host=%s" % (host))
            if is_remote_user(owner):
                cmd.append("--user=%s" % (owner))
            cmd.append("--")
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, "suite job log directory", host, owner))
            job_log_dirs = []
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
            cmd += job_log_dirs
            self.proc_pool.put_command(
                SubProcContext(cmd_key, cmd), callback, [suite, itasks])
Example #52
0
    def kill_task_jobs(self, suite, itasks):
        """Hold active tasks and issue a kill command for their jobs.

        A task whose status is not in TASK_STATUSES_ACTIVE is skipped with a
        warning; the others are set to held and passed on to the kill
        command.

        """
        killable = []
        for itask in itasks:
            if itask.state.status not in TASK_STATUSES_ACTIVE:
                LOG.warning('skipping %s: task not killable' % itask.identity)
                continue
            itask.state.set_held()
            killable.append(itask)
        self._run_job_cmd(
            self.JOBS_KILL, suite, killable, self._kill_task_jobs_callback)
Example #53
0
 def _get_job_scripts(itask, rtconfig):
     """Return pre-script, script, post-script for a job."""
     script = rtconfig['script']
     pre_script = rtconfig['pre-script']
     post_script = rtconfig['post-script']
     if itask.tdef.suite_polling_cfg:
         # Automatic suite state polling script
         comstr = "cylc suite-state " + \
                  " --task=" + itask.tdef.suite_polling_cfg['task'] + \
                  " --point=" + str(itask.point)
         if LOG.isEnabledFor(DEBUG):
             comstr += ' --debug'
         for key, fmt in [
                 ('user', ' --%s=%s'),
                 ('host', ' --%s=%s'),
                 ('interval', ' --%s=%d'),
                 ('max-polls', ' --%s=%s'),
                 ('run-dir', ' --%s=%s')]:
             if rtconfig['suite state polling'][key]:
                 comstr += fmt % (key, rtconfig['suite state polling'][key])
         if rtconfig['suite state polling']['message']:
             comstr += " --message='%s'" % (
                 rtconfig['suite state polling']['message'])
         else:
             comstr += " --status=" + itask.tdef.suite_polling_cfg['status']
         comstr += " " + itask.tdef.suite_polling_cfg['suite']
         script = "echo " + comstr + "\n" + comstr
     return pre_script, script, post_script
Example #54
0
    def check_task_jobs(self, suite, task_pool):
        """Check submission and execution timeout and polling timers.

        Collect every task in the pool for which the task events manager's
        "check_job_time" returns true at the current time, then poll the
        collected tasks in one go.
        """
        now = time()
        due_tasks = set()
        for itask in task_pool.get_tasks():
            if not self.task_events_mgr.check_job_time(itask, now):
                continue
            due_tasks.add(itask)
            if itask.poll_timer.delay is None:
                continue
            LOG.info(
                '[%s] -poll now, (next in %s)',
                itask, itask.poll_timer.delay_timeout_as_str())
        if not due_tasks:
            return
        self.poll_task_jobs(suite, due_tasks)
Example #55
0
    def _process_job_logs_retrieval(self, schd_ctx, ctx, id_keys):
        """Queue retrieval of task job logs from a remote user@host.

        Build the configured "retrieve job logs command" so that only the
        job log directories named by "id_keys" are included, and put it on
        the process pool.
        """
        user_at_host = ctx.user_at_host
        if user_at_host and "@" in user_at_host:
            s_user, s_host = user_at_host.split("@", 1)
        else:
            s_user, s_host = None, user_at_host
        ssh_str = str(glbl_cfg().get_host_item("ssh command", s_host, s_user))
        rsync_str = str(glbl_cfg().get_host_item(
            "retrieve job logs command", s_host, s_user))

        cmd = shlex.split(rsync_str)
        cmd.append("--rsh=" + ssh_str)
        if LOG.isEnabledFor(DEBUG):
            cmd.append("-v")
        if ctx.max_size:
            cmd.append("--max-size=%s" % (ctx.max_size,))
        # Includes and excludes
        includes = set()
        for _, point, name, submit_num in id_keys:
            # Include relevant directories, all levels needed
            job_dir = "/%s/%s/%02d" % (point, name, submit_num)
            includes.update((
                "/%s" % (point),
                "/%s/%s" % (point, name),
                job_dir,
                job_dir + "/**"))
        cmd.extend("--include=%s" % (include) for include in sorted(includes))
        cmd.append("--exclude=/**")  # exclude everything else
        # Remote source
        remote_dir = glbl_cfg().get_derived_host_item(
            schd_ctx.suite, "suite job log directory", s_host, s_user)
        cmd.append(user_at_host + ":" + remote_dir + "/")
        # Local target
        local_dir = glbl_cfg().get_derived_host_item(
            schd_ctx.suite, "suite job log directory")
        cmd.append(local_dir + "/")
        self.proc_pool.put_command(
            SubProcContext(ctx, cmd, env=dict(os.environ), id_keys=id_keys),
            self._job_logs_retrieval_callback, [schd_ctx])
Example #56
0
    def _process_message_started(self, itask, event_time):
        """Handle a "started" message on behalf of process_message.

        Reset the task state to running, record the start time, reset the
        job timers, update the task job database entry, and zero the
        submission retry counter if the task has one.
        """
        if itask.job_vacated:
            itask.job_vacated = False
            LOG.warning("[%s] -Vacated job restarted", itask)
        self.pflag = True
        state_changed = itask.state.reset_state(TASK_STATUS_RUNNING)
        if state_changed:
            self.setup_event_handlers(itask, 'started', 'job started')
        itask.set_summary_time('started', event_time)
        self._reset_job_timers(itask)
        started_time_string = itask.summary['started_time_string']
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"time_run": started_time_string})

        # submission was successful so reset submission try number
        try_timers = itask.try_timers
        if TASK_STATUS_SUBMIT_RETRYING in try_timers:
            try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
Example #57
0
    async def async_request(self, command, args=None, timeout=None):
        """Send an asynchronous request using asyncio.

        Has the same arguments and return values as ``serial_request``.

        Raises ClientError if the passphrase cannot be read, the response
        cannot be decrypted, or the response holds an error; raises
        ClientTimeout if no response arrives within the timeout.

        """
        # A given timeout is scaled by 1000 for the poller; a missing or
        # zero timeout falls back to the client's own setting.
        poll_timeout = None
        if timeout:
            poll_timeout = float(timeout) * 1000
        poll_timeout = poll_timeout or self.timeout
        args = args or {}

        # get secret for this request
        # assumes secret won't change during the request
        try:
            secret = self.secret()
        except cylc.suite_srv_files_mgr.SuiteServiceFileError:
            raise ClientError('could not read suite passphrase')

        # send message
        msg = {'command': command, 'args': args}
        msg.update(self.header)
        LOG.debug('zmq:send %s' % msg)
        self.socket.send_string(encrypt(msg, secret))

        # receive response
        if not self.poller.poll(poll_timeout):
            if self.timeout_handler:
                self.timeout_handler()
            raise ClientTimeout('Timeout waiting for server response.')
        res = await self.socket.recv_string()

        try:
            response = decrypt(res, secret)
            LOG.debug('zmq:recv %s' % response)
        except jose.exceptions.JWTError:
            raise ClientError(
                'Could not decrypt response. Has the passphrase changed?')

        try:
            return response['data']
        except KeyError:
            # No data: the response is expected to describe an error.
            error = response['error']
            raise ClientError(error['message'], error.get('traceback'))
Example #58
0
    def callback(self, ctx):
        """Callback for asynchronous xtrigger functions.

        Remove the function's signature from the active list, then, if its
        output parses as a JSON (satisfied, results) pair and is satisfied,
        record the results dict against the signature.

        """
        LOG.debug(ctx)
        signature = ctx.get_signature()
        self.active.remove(signature)
        try:
            is_satisfied, func_results = json.loads(ctx.out)
        except (ValueError, TypeError):
            # Output missing, not valid JSON, or not a 2-item sequence.
            return
        LOG.debug('%s: returned %s' % (signature, func_results))
        if not is_satisfied:
            return
        self.pflag = True
        self.sat_xtrig[signature] = func_results
Example #59
0
 def _manip_task_jobs_callback(
         self, ctx, suite, itasks, summary_callback, more_callbacks=None):
     """Callback when submit/poll/kill tasks command exits."""
     if ctx.ret_code:
         LOG.error(ctx)
     else:
         LOG.debug(ctx)
     # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
     #
     # Note for "reload": A TaskProxy instance may be replaced on reload, so
     # the "itasks" list may not reference the TaskProxy objects that
     # replace the old ones. The .reload_successor attribute provides the
     # link(s) for us to get to the latest replacement.
     #
     # Note for "kill": It is possible for a job to trigger its trap and
     # report back to the suite back this logic is called. If so, the task
     # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
     # its output line will be ignored here.
     tasks = {}
     for itask in itasks:
         while itask.reload_successor is not None:
             itask = itask.reload_successor
         if itask.point is not None and itask.submit_num:
             submit_num = "%02d" % (itask.submit_num)
             tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
     handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
     if more_callbacks:
         for prefix, callback in more_callbacks.items():
             handlers.append((prefix, callback))
     out = ctx.out
     if not out:
         out = ""
     bad_tasks = dict(tasks)
     for line in out.splitlines(True):
         for prefix, callback in handlers:
             if line.startswith(prefix):
                 line = line[len(prefix):].strip()
                 try:
                     path = line.split("|", 2)[1]  # timestamp, path, status
                     point, name, submit_num = path.split(os.sep, 2)
                     if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                         del bad_tasks[(point, name, submit_num)]
                     itask = tasks[(point, name, submit_num)]
                     callback(suite, itask, ctx, line)
                 except (LookupError, ValueError, KeyError) as exc:
                     LOG.warning(
                         'Unhandled %s output: %s', ctx.cmd_key, line)
                     LOG.exception(exc)
     # Task jobs that are in the original command but did not get a status
     # in the output. Handle as failures.
     for key, itask in sorted(bad_tasks.items()):
         line = (
             "|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
         summary_callback(suite, itask, ctx, line)