Esempio n. 1
0
 def _process_message_submit_failed(self, itask, event_time):
     """Helper for process_message, handle a submit-failed message."""
     LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
     if event_time is None:
         event_time = get_current_time_string()
     self.suite_db_mgr.put_update_task_jobs(
         itask, {
             "time_submit_exit": get_current_time_string(),
             "submit_status": 1,
         })
     itask.summary['submit_method_id'] = None
     if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
             itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
         # No submission retry lined up: definitive failure.
         self.pflag = True
         # See github #476.
         if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
             self.setup_event_handlers(itask, self.EVENT_SUBMIT_FAILED,
                                       'job %s' % self.EVENT_SUBMIT_FAILED)
     else:
         # There is a submission retry lined up.
         timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
         delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
         msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
         LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
         itask.set_summary_message(msg)
         if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
             self.setup_event_handlers(
                 itask, self.EVENT_SUBMIT_RETRY,
                 "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
     self._reset_job_timers(itask)
Esempio n. 2
0
 def load_config(self):
     """Load the suite config."""
     if self.suiterc:
         is_reload = True
         collapsed = self.suiterc.closed_families
     else:
         is_reload = False
         collapsed = []
     try:
         self.suiterc = SuiteConfig(
             self.suite,
             self.file,
             self.template_vars,
             is_reload=is_reload,
             collapsed=collapsed,
             cli_initial_point_string=self.start_point_string,
             vis_start_string=self.start_point_string,
             vis_stop_string=self.stop_point_string)
     except Exception as exc:
         msg = "Failed - parsing error?\n\n%s" % exc
         LOG.error(msg)
         if self.interactive:
             dia = gtk.MessageDialog(type=gtk.MESSAGE_ERROR,
                                     buttons=gtk.BUTTONS_OK,
                                     message_format=msg)
             dia.run()
             dia.destroy()
             return False
         sys.exit(1)
     self.inherit = self.suiterc.get_parent_lists()
     return True
Esempio n. 3
0
 def list_suites(self, regfilter=None):
     """Return a filtered list of valid suite registrations."""
     rec_regfilter = None
     if regfilter:
         try:
             rec_regfilter = re.compile(regfilter)
         except re.error as exc:
             raise ValueError("%s: %s" % (regfilter, exc))
     run_d = glbl_cfg().get_host_item('run directory')
     results = []
     for dirpath, dnames, fnames in os.walk(run_d, followlinks=True):
         # Always descend for top directory, but
         # don't descend further if it has a:
         # * .service/
         # * cylc-suite.db: (pre-cylc-7 suites don't have ".service/").
         if dirpath != run_d and (self.DIR_BASE_SRV in dnames
                                  or "cylc-suite.db" in fnames):
             dnames[:] = []
         # Choose only suites with .service and matching filter
         reg = os.path.relpath(dirpath, run_d)
         path = os.path.join(dirpath, self.DIR_BASE_SRV)
         if (not self._locate_item(self.FILE_BASE_SOURCE, path)
                 or rec_regfilter and not rec_regfilter.search(reg)):
             continue
         try:
             results.append([
                 reg,
                 self.get_suite_source_dir(reg),
                 self.get_suite_title(reg)
             ])
         except (IOError, SuiteServiceFileError) as exc:
             LOG.error('%s: %s', reg, exc)
     return results
Esempio n. 4
0
 def _process_message_submit_failed(self, itask, event_time):
     """Helper for process_message, handle a submit-failed message."""
     LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
     if event_time is None:
         event_time = get_current_time_string()
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "time_submit_exit": event_time,
         "submit_status": 1,
     })
     itask.summary['submit_method_id'] = None
     self.pflag = True
     if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
             itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
         # No submission retry lined up: definitive failure.
         # See github #476.
         if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
             self.setup_event_handlers(
                 itask, self.EVENT_SUBMIT_FAILED,
                 'job %s' % self.EVENT_SUBMIT_FAILED)
     else:
         # There is a submission retry lined up.
         timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
         delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
         msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
         LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
         itask.set_summary_message(msg)
         if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
             self.setup_event_handlers(
                 itask, self.EVENT_SUBMIT_RETRY,
                 "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
     self._reset_job_timers(itask)
Esempio n. 5
0
 def list_suites(self, regfilter=None):
     """Return a filtered list of valid suite registrations."""
     rec_regfilter = None
     if regfilter:
         try:
             rec_regfilter = re.compile(regfilter)
         except re.error as exc:
             raise ValueError("%s: %s" % (regfilter, exc))
     run_d = glbl_cfg().get_host_item('run directory')
     results = []
     for dirpath, dnames, _ in os.walk(run_d, followlinks=True):
         # Always descend for top directory, but
         # don't descend further if it has a .service/ dir
         if dirpath != run_d and self.DIR_BASE_SRV in dnames:
             dnames[:] = []
         # Choose only suites with .service and matching filter
         reg = os.path.relpath(dirpath, run_d)
         path = os.path.join(dirpath, self.DIR_BASE_SRV)
         if (not self._locate_item(self.FILE_BASE_SOURCE, path) or
                 rec_regfilter and not rec_regfilter.search(reg)):
             continue
         try:
             results.append([
                 reg,
                 self.get_suite_source_dir(reg),
                 self.get_suite_title(reg)])
         except (IOError, SuiteServiceFileError) as exc:
             LOG.error('%s: %s', reg, exc)
     return results
Esempio n. 6
0
 def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
     """Helper for self._prep_submit_task_job. On error."""
     LOG.debug("submit_num %s" % itask.submit_num)
     LOG.debug(traceback.format_exc())
     LOG.error(exc)
     log_task_job_activity(SubProcContext(self.JOBS_SUBMIT,
                                          action,
                                          err=exc,
                                          ret_code=1),
                           suite,
                           itask.point,
                           itask.tdef.name,
                           submit_num=itask.submit_num)
     if not dry_run:
         # Persist
         self.suite_db_mgr.put_insert_task_jobs(
             itask, {
                 'is_manual_submit': itask.is_manual_submit,
                 'try_num': itask.get_try_num(),
                 'time_submit': get_current_time_string(),
                 'batch_sys_name': itask.summary.get('batch_sys_name'),
             })
         itask.is_manual_submit = False
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
Esempio n. 7
0
    def clear_broadcast(self,
                        point_strings=None,
                        namespaces=None,
                        cancel_settings=None):
        """Clear broadcasts globally, or for listed namespaces and/or points.

        Return a tuple (modified_settings, bad_options), where:
        * modified_settings is similar to the return value of the "put" method,
          but for removed broadcasts.
        * bad_options is a dict in the form:
              {"point_strings": ["20020202", ..."], ...}
          The dict is only populated if there are options not associated with
          previous broadcasts. The keys can be:
          * point_strings: a list of bad point strings.
          * namespaces: a list of bad namespaces.
          * cancel: a list of tuples. Each tuple contains the keys of a bad
            setting.
        """
        # If cancel_settings defined, only clear specific broadcasts
        cancel_keys_list = self._settings_to_keys_list(cancel_settings)

        # Clear broadcasts
        modified_settings = []
        with self.lock:
            for point_string, point_string_settings in self.broadcasts.items():
                if point_strings and point_string not in point_strings:
                    continue
                for namespace, namespace_settings in (
                        point_string_settings.items()):
                    if namespaces and namespace not in namespaces:
                        continue
                    stuff_stack = [([], namespace_settings)]
                    while stuff_stack:
                        keys, stuff = stuff_stack.pop()
                        for key, value in stuff.items():
                            if isinstance(value, dict):
                                stuff_stack.append((keys + [key], value))
                            elif (not cancel_keys_list
                                  or keys + [key] in cancel_keys_list):
                                stuff[key] = None
                                setting = {key: value}
                                for rkey in reversed(keys):
                                    setting = {rkey: setting}
                                modified_settings.append(
                                    (point_string, namespace, setting))

        # Prune any empty branches
        bad_options = self._get_bad_options(self._prune(), point_strings,
                                            namespaces, cancel_keys_list)

        # Log the broadcast
        self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True)
        LOG.info(get_broadcast_change_report(modified_settings,
                                             is_cancel=True))
        if bad_options:
            LOG.error(get_broadcast_bad_options_report(bad_options))

        return (modified_settings, bad_options)
Esempio n. 8
0
    def clear_broadcast(
            self, point_strings=None, namespaces=None, cancel_settings=None):
        """Clear broadcasts globally, or for listed namespaces and/or points.

        Return a tuple (modified_settings, bad_options), where:
        * modified_settings is similar to the return value of the "put" method,
          but for removed broadcasts.
        * bad_options is a dict in the form:
              {"point_strings": ["20020202", ..."], ...}
          The dict is only populated if there are options not associated with
          previous broadcasts. The keys can be:
          * point_strings: a list of bad point strings.
          * namespaces: a list of bad namespaces.
          * cancel: a list of tuples. Each tuple contains the keys of a bad
            setting.
        """
        # If cancel_settings defined, only clear specific broadcasts
        cancel_keys_list = self._settings_to_keys_list(cancel_settings)

        # Clear broadcasts
        modified_settings = []
        with self.lock:
            for point_string, point_string_settings in self.broadcasts.items():
                if point_strings and point_string not in point_strings:
                    continue
                for namespace, namespace_settings in (
                        point_string_settings.items()):
                    if namespaces and namespace not in namespaces:
                        continue
                    stuff_stack = [([], namespace_settings)]
                    while stuff_stack:
                        keys, stuff = stuff_stack.pop()
                        for key, value in stuff.items():
                            if isinstance(value, dict):
                                stuff_stack.append((keys + [key], value))
                            elif (not cancel_keys_list or
                                    keys + [key] in cancel_keys_list):
                                stuff[key] = None
                                setting = {key: value}
                                for rkey in reversed(keys):
                                    setting = {rkey: setting}
                                modified_settings.append(
                                    (point_string, namespace, setting))

        # Prune any empty branches
        bad_options = self._get_bad_options(
            self._prune(), point_strings, namespaces, cancel_keys_list)

        # Log the broadcast
        self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True)
        LOG.info(
            get_broadcast_change_report(modified_settings, is_cancel=True))
        if bad_options:
            LOG.error(get_broadcast_bad_options_report(bad_options))

        return (modified_settings, bad_options)
Esempio n. 9
0
 def _run_event_handlers_callback(proc_ctx, abort_on_error=False):
     """Callback on completion of a suite event handler."""
     if proc_ctx.ret_code:
         msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
         LOG.error(str(proc_ctx))
         LOG.error(msg)
         if abort_on_error:
             raise SuiteEventError(msg)
     else:
         LOG.info(str(proc_ctx))
Esempio n. 10
0
 def _manip_task_jobs_callback(
         self, ctx, suite, itasks, summary_callback, more_callbacks=None):
     """Callback when submit/poll/kill tasks command exits."""
     if ctx.ret_code:
         LOG.error(ctx)
     else:
         LOG.debug(ctx)
     # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
     #
     # Note for "reload": A TaskProxy instance may be replaced on reload, so
     # the "itasks" list may not reference the TaskProxy objects that
     # replace the old ones. The .reload_successor attribute provides the
     # link(s) for us to get to the latest replacement.
     #
     # Note for "kill": It is possible for a job to trigger its trap and
     # report back to the suite back this logic is called. If so, the task
     # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
     # its output line will be ignored here.
     tasks = {}
     for itask in itasks:
         while itask.reload_successor is not None:
             itask = itask.reload_successor
         if itask.point is not None and itask.submit_num:
             submit_num = "%02d" % (itask.submit_num)
             tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
     handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
     if more_callbacks:
         for prefix, callback in more_callbacks.items():
             handlers.append((prefix, callback))
     out = ctx.out
     if not out:
         out = ""
     bad_tasks = dict(tasks)
     for line in out.splitlines(True):
         for prefix, callback in handlers:
             if line.startswith(prefix):
                 line = line[len(prefix):].strip()
                 try:
                     path = line.split("|", 2)[1]  # timestamp, path, status
                     point, name, submit_num = path.split(os.sep, 2)
                     if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                         del bad_tasks[(point, name, submit_num)]
                     itask = tasks[(point, name, submit_num)]
                     callback(suite, itask, ctx, line)
                 except (LookupError, ValueError, KeyError) as exc:
                     LOG.warning(
                         'Unhandled %s output: %s', ctx.cmd_key, line)
                     LOG.exception(exc)
     # Task jobs that are in the original command but did not get a status
     # in the output. Handle as failures.
     for key, itask in sorted(bad_tasks.items()):
         line = (
             "|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
         summary_callback(suite, itask, ctx, line)
Esempio n. 11
0
 def _manip_task_jobs_callback(
         self, ctx, suite, itasks, summary_callback, more_callbacks=None):
     """Callback when submit/poll/kill tasks command exits."""
     if ctx.ret_code:
         LOG.error(ctx)
     else:
         LOG.debug(ctx)
     # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
     #
     # Note for "reload": A TaskProxy instance may be replaced on reload, so
     # the "itasks" list may not reference the TaskProxy objects that
     # replace the old ones. The .reload_successor attribute provides the
     # link(s) for us to get to the latest replacement.
     #
     # Note for "kill": It is possible for a job to trigger its trap and
     # report back to the suite back this logic is called. If so, the task
     # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
     # its output line will be ignored here.
     tasks = {}
     for itask in itasks:
         while itask.reload_successor is not None:
             itask = itask.reload_successor
         if itask.point is not None and itask.submit_num:
             submit_num = "%02d" % (itask.submit_num)
             tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
     handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
     if more_callbacks:
         for prefix, callback in more_callbacks.items():
             handlers.append((prefix, callback))
     out = ctx.out
     if not out:
         out = ""
     bad_tasks = dict(tasks)
     for line in out.splitlines(True):
         for prefix, callback in handlers:
             if line.startswith(prefix):
                 line = line[len(prefix):].strip()
                 try:
                     path = line.split("|", 2)[1]  # timestamp, path, status
                     point, name, submit_num = path.split(os.sep, 2)
                     if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                         del bad_tasks[(point, name, submit_num)]
                     itask = tasks[(point, name, submit_num)]
                     callback(suite, itask, ctx, line)
                 except (LookupError, ValueError, KeyError) as exc:
                     LOG.warning(
                         'Unhandled %s output: %s', ctx.cmd_key, line)
                     LOG.exception(exc)
     # Task jobs that are in the original command but did not get a status
     # in the output. Handle as failures.
     for key, itask in sorted(bad_tasks.items()):
         line = (
             "|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
         summary_callback(suite, itask, ctx, line)
Esempio n. 12
0
 def _run_event_custom_handlers(self, config, ctx):
     """Helper for "run_event_handlers", custom event handlers."""
     # Look for event handlers
     # 1. Handlers for specific event
     # 2. General handlers
     handlers = self.get_events_conf(config, '%s handler' % ctx.event)
     if not handlers and (ctx.event in self.get_events_conf(
             config, 'handler events', [])):
         handlers = self.get_events_conf(config, 'handlers')
     if not handlers:
         return
     for i, handler in enumerate(handlers):
         cmd_key = ('%s-%02d' % (self.SUITE_EVENT_HANDLER, i), ctx.event)
         # Handler command may be a string for substitution
         abort_on_error = self.get_events_conf(
             config, 'abort if %s handler fails' % ctx.event)
         try:
             handler_data = {
                 'event': quote(ctx.event),
                 'message': quote(ctx.reason),
                 'suite': quote(ctx.suite),
                 'suite_uuid': quote(str(ctx.uuid_str)),
             }
             if config.cfg['meta']:
                 for key, value in config.cfg['meta'].items():
                     if key == "URL":
                         handler_data["suite_url"] = quote(value)
                     handler_data[key] = quote(value)
             cmd = handler % (handler_data)
         except KeyError as exc:
             message = "%s bad template: %s" % (cmd_key, exc)
             LOG.error(message)
             if abort_on_error:
                 raise SuiteEventError(message)
             continue
         if cmd == handler:
             # Nothing substituted, assume classic interface
             cmd = "%s '%s' '%s' '%s'" % (handler, ctx.event, ctx.suite,
                                          ctx.reason)
         proc_ctx = SubProcContext(cmd_key,
                                   cmd,
                                   env=dict(os.environ),
                                   shell=True)
         if abort_on_error or self.proc_pool.closed:
             # Run command in foreground if abort on failure is set or if
             # process pool is closed
             self.proc_pool.run_command(proc_ctx)
             self._run_event_handlers_callback(
                 proc_ctx, abort_on_error=abort_on_error)
         else:
             # Run command using process pool otherwise
             self.proc_pool.put_command(proc_ctx,
                                        self._run_event_handlers_callback)
Esempio n. 13
0
    def restart_upgrade(self):
        """Vacuum/upgrade runtime DB on restart."""
        # Backward compat, upgrade database with state file if necessary
        suite_run_d = os.path.dirname(os.path.dirname(self.pub_path))
        old_pri_db_path = os.path.join(suite_run_d, 'state',
                                       CylcSuiteDAO.OLD_DB_FILE_BASE_NAME)
        old_pri_db_path_611 = os.path.join(
            suite_run_d, CylcSuiteDAO.OLD_DB_FILE_BASE_NAME_611[0])
        old_state_file_path = os.path.join(suite_run_d, "state", "state")
        if (os.path.exists(old_pri_db_path)
                and os.path.exists(old_state_file_path)
                and not os.path.exists(self.pri_path)):
            # Upgrade pre-6.11.X runtime database + state file
            copy(old_pri_db_path, self.pri_path)
            pri_dao = self.get_pri_dao()
            pri_dao.upgrade_with_state_file(old_state_file_path)
            target = os.path.join(suite_run_d, "state.tar.gz")
            cmd = ["tar", "-C", suite_run_d, "-czf", target, "state"]
            if call(cmd, stdin=open(os.devnull)) == 0:
                rmtree(os.path.join(suite_run_d, "state"), ignore_errors=True)
            else:
                try:
                    os.unlink(os.path.join(suite_run_d, "state.tar.gz"))
                except OSError:
                    pass
                LOG.error("cannot tar-gzip + remove old state/ directory")
            # Remove old files as well
            try:
                os.unlink(os.path.join(suite_run_d, "cylc-suite-env"))
            except OSError:
                pass
        elif (os.path.exists(old_pri_db_path_611)
              and not os.path.exists(self.pri_path)):
            # Upgrade 6.11.X runtime database
            os.rename(old_pri_db_path_611, self.pri_path)
            pri_dao = self.get_pri_dao()
            pri_dao.upgrade_from_611()
            # Remove old files as well
            for name in [
                    CylcSuiteDAO.OLD_DB_FILE_BASE_NAME_611[1], "cylc-suite-env"
            ]:
                try:
                    os.unlink(os.path.join(suite_run_d, name))
                except OSError:
                    pass
        else:
            pri_dao = self.get_pri_dao()
            pri_dao.upgrade_pickle_to_json()

        # Vacuum the primary/private database file
        pri_dao.vacuum()
        pri_dao.close()
Esempio n. 14
0
 def _remote_host_select_callback(self, proc_ctx, cmd_str):
     """Callback when host select command exits"""
     self.ready = True
     if proc_ctx.ret_code == 0 and proc_ctx.out:
         # Good status
         LOG.debug(proc_ctx)
         self.remote_host_str_map[cmd_str] = proc_ctx.out.splitlines()[0]
     else:
         # Bad status
         LOG.error(proc_ctx)
         self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
             TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
             proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)
Esempio n. 15
0
 def _remote_host_select_callback(self, proc_ctx, cmd_str):
     """Callback when host select command exits"""
     self.ready = True
     if proc_ctx.ret_code == 0 and proc_ctx.out:
         # Good status
         LOG.debug(proc_ctx)
         self.remote_host_str_map[cmd_str] = proc_ctx.out.splitlines()[0]
     else:
         # Bad status
         LOG.error(proc_ctx)
         self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
             TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
             proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)
Esempio n. 16
0
    def __init__(self, suite):
        # Suite only needed for back-compat with old clients (see below):
        self.suite = suite
        self.engine = None
        self.port = None

        # Figure out the ports we are allowed to use.
        self.ok_ports = glbl_cfg().get(['suite servers', 'run ports'])
        random.shuffle(self.ok_ports)

        comms_options = glbl_cfg().get(['communication', 'options'])

        # HTTP Digest Auth uses MD5 - pretty secure in this use case.
        # Extending it with extra algorithms is allowed, but won't be
        # supported by most browsers. requests and urllib2 are OK though.
        self.hash_algorithm = "MD5"
        if "SHA1" in comms_options:
            # Note 'SHA' rather than 'SHA1'.
            self.hash_algorithm = "SHA"

        self.srv_files_mgr = SuiteSrvFilesManager()
        self.comms_method = glbl_cfg().get(['communication', 'method'])
        self.get_ha1 = cherrypy.lib.auth_digest.get_ha1_dict_plain(
            {
                'cylc':
                self.srv_files_mgr.get_auth_item(
                    self.srv_files_mgr.FILE_BASE_PASSPHRASE,
                    suite,
                    content=True),
                'anon':
                NO_PASSPHRASE
            },
            algorithm=self.hash_algorithm)
        if self.comms_method == 'http':
            self.cert = None
            self.pkey = None
        else:  # if self.comms_method in [None, 'https']:
            try:
                self.cert = self.srv_files_mgr.get_auth_item(
                    self.srv_files_mgr.FILE_BASE_SSL_CERT, suite)
                self.pkey = self.srv_files_mgr.get_auth_item(
                    self.srv_files_mgr.FILE_BASE_SSL_PEM, suite)
            except SuiteServiceFileError:
                LOG.error("no HTTPS/OpenSSL support. Aborting...")
                raise CylcError("No HTTPS support. "
                                "Configure user's global.rc to use HTTP.")
        self.start()
Esempio n. 17
0
 def _manip_task_jobs_callback(
         self, ctx, suite, itasks, summary_callback, more_callbacks=None):
     """Callback when submit/poll/kill tasks command exits."""
     if ctx.ret_code:
         LOG.error(ctx)
     else:
         LOG.debug(ctx)
     tasks = {}
     # Note for "kill": It is possible for a job to trigger its trap and
     # report back to the suite back this logic is called. If so, the task
     # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
     # its output line will be ignored here.
     for itask in itasks:
         if itask.point is not None and itask.submit_num:
             submit_num = "%02d" % (itask.submit_num)
             tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
     handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
     if more_callbacks:
         for prefix, callback in more_callbacks.items():
             handlers.append((prefix, callback))
     out = ctx.out
     if not out:
         out = ""
         # Something is very wrong here
         # Fallback to use "job_log_dirs" list to report the problem
         job_log_dirs = ctx.cmd_kwargs.get("job_log_dirs", [])
         for job_log_dir in job_log_dirs:
             point, name, submit_num = job_log_dir.split(os.sep, 2)
             itask = tasks[(point, name, submit_num)]
             out += (self.batch_sys_mgr.OUT_PREFIX_SUMMARY +
                     "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
     for line in out.splitlines(True):
         for prefix, callback in handlers:
             if line.startswith(prefix):
                 line = line[len(prefix):].strip()
                 try:
                     path = line.split("|", 2)[1]  # timestamp, path, status
                     point, name, submit_num = path.split(os.sep, 2)
                     itask = tasks[(point, name, submit_num)]
                     callback(suite, itask, ctx, line)
                 except (LookupError, ValueError) as exc:
                     LOG.warning(
                         'Unhandled %s output: %s', ctx.cmd_key, line)
                     LOG.exception(exc)
Esempio n. 18
0
 def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
     """Helper for self._prep_submit_task_job. On error."""
     LOG.debug("submit_num %s" % itask.submit_num)
     LOG.debug(traceback.format_exc())
     LOG.error(exc)
     log_task_job_activity(
         SubProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1),
         suite, itask.point, itask.tdef.name, submit_num=itask.submit_num)
     if not dry_run:
         # Persist
         self.suite_db_mgr.put_insert_task_jobs(itask, {
             'is_manual_submit': itask.is_manual_submit,
             'try_num': itask.get_try_num(),
             'time_submit': get_current_time_string(),
             'batch_sys_name': itask.summary.get('batch_sys_name'),
         })
         itask.is_manual_submit = False
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
Esempio n. 19
0
 def load(self):
     """Load or reload configuration from files."""
     self.sparse.clear()
     self.dense.clear()
     LOG.debug("Loading site/user global config files")
     conf_path_str = os.getenv("CYLC_CONF_PATH")
     if conf_path_str is None:
         # CYLC_CONF_PATH not defined, use default locations.
         for conf_dir_1, conf_dir_2, conf_type in [
             (self.SITE_CONF_DIR, self.SITE_CONF_DIR_OLD,
              upgrader.SITE_CONFIG),
             (self.USER_CONF_DIR_1, self.USER_CONF_DIR_2,
              upgrader.USER_CONFIG)
         ]:
             fname1 = os.path.join(conf_dir_1, self.CONF_BASE)
             fname2 = os.path.join(conf_dir_2, self.CONF_BASE)
             if os.access(fname1, os.F_OK | os.R_OK):
                 fname = fname1
             elif os.access(fname2, os.F_OK | os.R_OK):
                 fname = fname2
             else:
                 continue
             try:
                 self.loadcfg(fname, conf_type)
             except ParsecError as exc:
                 if conf_type == upgrader.SITE_CONFIG:
                     # Warn on bad site file (users can't fix it).
                     LOG.warning('ignoring bad %s %s:\n%s', conf_type,
                                 fname, exc)
                 else:
                     # Abort on bad user file (users can fix it).
                     LOG.error('bad %s %s', conf_type, fname)
                     raise
                 break
     elif conf_path_str:
         # CYLC_CONF_PATH defined with a value
         for path in conf_path_str.split(os.pathsep):
             fname = os.path.join(path, self.CONF_BASE)
             if os.access(fname, os.F_OK | os.R_OK):
                 self.loadcfg(fname, upgrader.USER_CONFIG)
     # (OK if no global.rc is found, just use system defaults).
     self.transform()
Esempio n. 20
0
 def load(self):
     """Load or reload configuration from files."""
     self.sparse.clear()
     self.dense.clear()
     LOG.debug("Loading site/user global config files")
     conf_path_str = os.getenv("CYLC_CONF_PATH")
     if conf_path_str is None:
         # CYLC_CONF_PATH not defined, use default locations.
         for conf_dir_1, conf_dir_2, conf_type in [
                 (self.SITE_CONF_DIR, self.SITE_CONF_DIR_OLD,
                  upgrader.SITE_CONFIG),
                 (self.USER_CONF_DIR_1, self.USER_CONF_DIR_2,
                  upgrader.USER_CONFIG)]:
             fname1 = os.path.join(conf_dir_1, self.CONF_BASE)
             fname2 = os.path.join(conf_dir_2, self.CONF_BASE)
             if os.access(fname1, os.F_OK | os.R_OK):
                 fname = fname1
             elif os.access(fname2, os.F_OK | os.R_OK):
                 fname = fname2
             else:
                 continue
             try:
                 self.loadcfg(fname, conf_type)
             except ParsecError as exc:
                 if conf_type == upgrader.SITE_CONFIG:
                     # Warn on bad site file (users can't fix it).
                     LOG.warning(
                         'ignoring bad %s %s:\n%s', conf_type, fname, exc)
                 else:
                     # Abort on bad user file (users can fix it).
                     LOG.error('bad %s %s', conf_type, fname)
                     raise
                 break
     elif conf_path_str:
         # CYLC_CONF_PATH defined with a value
         for path in conf_path_str.split(os.pathsep):
             fname = os.path.join(path, self.CONF_BASE)
             if os.access(fname, os.F_OK | os.R_OK):
                 self.loadcfg(fname, upgrader.USER_CONFIG)
     # (OK if no global.rc is found, just use system defaults).
     self.transform()
Esempio n. 21
0
 def _remote_init_callback(self, proc_ctx, host, owner, tmphandle):
     """Callback when "cylc remote-init" exits"""
     self.ready = True
     try:
         tmphandle.close()
     except OSError:  # E.g. ignore bad unlink, etc
         pass
     if proc_ctx.ret_code == 0:
         for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
             if status in proc_ctx.out:
                 # Good status
                 LOG.debug(proc_ctx)
                 self.remote_init_map[(host, owner)] = status
                 return
     # Bad status
     LOG.error(
         TaskRemoteMgmtError(TaskRemoteMgmtError.MSG_INIT, (host, owner),
                             ' '.join(quote(item) for item in proc_ctx.cmd),
                             proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
     LOG.error(proc_ctx)
     self.remote_init_map[(host, owner)] = REMOTE_INIT_FAILED
Esempio n. 22
0
 def _remote_init_callback(self, proc_ctx, host, owner, tmphandle):
     """Callback when "cylc remote-init" exits"""
     self.ready = True
     try:
         tmphandle.close()
     except OSError:  # E.g. ignore bad unlink, etc
         pass
     if proc_ctx.ret_code == 0:
         for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
             if status in proc_ctx.out:
                 # Good status
                 LOG.debug(proc_ctx)
                 self.remote_init_map[(host, owner)] = status
                 return
     # Bad status
     LOG.error(TaskRemoteMgmtError(
         TaskRemoteMgmtError.MSG_INIT,
         (host, owner), ' '.join(quote(item) for item in proc_ctx.cmd),
         proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
     LOG.error(proc_ctx)
     self.remote_init_map[(host, owner)] = REMOTE_INIT_FAILED
Esempio n. 23
0
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(suite, point, name,
                                                 submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write(ctx_str + '\n')
    except IOError as exc:
        # This happens when there is no job directory, e.g. if job host
        # selection command causes an submission failure, there will be no job
        # directory. In this case, just send the information to the suite log.
        LOG.exception(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
Esempio n. 24
0
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write((ctx_str + '\n').encode())
    except IOError as exc:
        # This happens when there is no job directory, e.g. if job host
        # selection command causes an submission failure, there will be no job
        # directory. In this case, just send the information to the suite log.
        LOG.exception(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
Esempio n. 25
0
 def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
     """Call back when log job retrieval completes."""
     if proc_ctx.ret_code:
         LOG.error(proc_ctx)
     else:
         LOG.debug(proc_ctx)
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             # All completed jobs are expected to have a "job.out".
             fnames = [JOB_LOG_OUT]
             try:
                 if key1[1] not in 'succeeded':
                     fnames.append(JOB_LOG_ERR)
             except TypeError:
                 pass
             fname_oks = {}
             for fname in fnames:
                 fname_oks[fname] = os.path.exists(
                     get_task_job_log(schd_ctx.suite, point, name,
                                      submit_num, fname))
             # All expected paths must exist to record a good attempt
             log_ctx = SubProcContext((key1, submit_num), None)
             if all(fname_oks.values()):
                 log_ctx.ret_code = 0
                 del self.event_timers[id_key]
             else:
                 log_ctx.ret_code = 1
                 log_ctx.err = "File(s) not retrieved:"
                 for fname, exist_ok in sorted(fname_oks.items()):
                     if not exist_ok:
                         log_ctx.err += " %s" % fname
                 self.event_timers[id_key].unset_waiting()
             log_task_job_activity(log_ctx, schd_ctx.suite, point, name,
                                   submit_num)
         except KeyError as exc:
             LOG.exception(exc)
Esempio n. 26
0
 def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
     """Call back when log job retrieval completes."""
     if proc_ctx.ret_code:
         LOG.error(proc_ctx)
     else:
         LOG.debug(proc_ctx)
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             # All completed jobs are expected to have a "job.out".
             fnames = [JOB_LOG_OUT]
             try:
                 if key1[1] not in 'succeeded':
                     fnames.append(JOB_LOG_ERR)
             except TypeError:
                 pass
             fname_oks = {}
             for fname in fnames:
                 fname_oks[fname] = os.path.exists(get_task_job_log(
                     schd_ctx.suite, point, name, submit_num, fname))
             # All expected paths must exist to record a good attempt
             log_ctx = SubProcContext((key1, submit_num), None)
             if all(fname_oks.values()):
                 log_ctx.ret_code = 0
                 del self.event_timers[id_key]
             else:
                 log_ctx.ret_code = 1
                 log_ctx.err = "File(s) not retrieved:"
                 for fname, exist_ok in sorted(fname_oks.items()):
                     if not exist_ok:
                         log_ctx.err += " %s" % fname
                 self.event_timers[id_key].unset_waiting()
             log_task_job_activity(
                 log_ctx, schd_ctx.suite, point, name, submit_num)
         except KeyError as exc:
             LOG.exception(exc)
Esempio n. 27
0
    def _setup_custom_event_handlers(self, itask, event, message):
        """Set up custom task event handlers."""
        handlers = self._get_events_conf(itask, event + ' handler')
        if (handlers is None and event in self._get_events_conf(
                itask, 'handler events', [])):
            handlers = self._get_events_conf(itask, 'handlers')
        if handlers is None:
            return
        retry_delays = self._get_events_conf(
            itask, 'handler retry delays',
            self.get_host_conf(itask, "task event handler retry delays"))
        if not retry_delays:
            retry_delays = [0]
        # There can be multiple custom event handlers
        for i, handler in enumerate(handlers):
            if event in self.NON_UNIQUE_EVENTS:
                key1 = ('%s-%02d' % (self.HANDLER_CUSTOM, i), '%s-%d' %
                        (event, itask.non_unique_events.get(event, 1)))
            else:
                key1 = ('%s-%02d' % (self.HANDLER_CUSTOM, i), event)
            id_key = (key1, str(itask.point), itask.tdef.name,
                      itask.submit_num)
            if id_key in self.event_timers:
                continue
            # Note: user@host may not always be set for a submit number, e.g.
            # on late event or if host select command fails. Use null string to
            # prevent issues in this case.
            user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
            if user_at_host and '@' not in user_at_host:
                # (only has 'user@' on the front if user is not suite owner).
                user_at_host = '%s@%s' % (get_user(), user_at_host)
            # Custom event handler can be a command template string
            # or a command that takes 4 arguments (classic interface)
            # Note quote() fails on None, need str(None).
            try:
                handler_data = {
                    "event":
                    quote(event),
                    "suite":
                    quote(self.suite),
                    'suite_uuid':
                    quote(str(self.uuid_str)),
                    "point":
                    quote(str(itask.point)),
                    "name":
                    quote(itask.tdef.name),
                    "submit_num":
                    itask.submit_num,
                    "try_num":
                    itask.get_try_num(),
                    "id":
                    quote(itask.identity),
                    "message":
                    quote(message),
                    "batch_sys_name":
                    quote(str(itask.summary['batch_sys_name'])),
                    "batch_sys_job_id":
                    quote(str(itask.summary['submit_method_id'])),
                    "submit_time":
                    quote(str(itask.summary['submitted_time_string'])),
                    "start_time":
                    quote(str(itask.summary['started_time_string'])),
                    "finish_time":
                    quote(str(itask.summary['finished_time_string'])),
                    "user@host":
                    quote(user_at_host)
                }

                if self.suite_cfg:
                    for key, value in self.suite_cfg.items():
                        if key == "URL":
                            handler_data["suite_url"] = quote(value)
                        else:
                            handler_data["suite_" + key] = quote(value)

                if itask.tdef.rtconfig['meta']:
                    for key, value in itask.tdef.rtconfig['meta'].items():
                        if key == "URL":
                            handler_data["task_url"] = quote(value)
                        handler_data[key] = quote(value)

                cmd = handler % (handler_data)
            except KeyError as exc:
                message = "%s/%s/%02d %s bad template: %s" % (
                    itask.point, itask.tdef.name, itask.submit_num, key1, exc)
                LOG.error(message)
                continue

            if cmd == handler:
                # Nothing substituted, assume classic interface
                cmd = "%s '%s' '%s' '%s' '%s'" % (handler, event, self.suite,
                                                  itask.identity, message)
            LOG.debug("[%s] -Queueing %s handler: %s", itask, event, cmd)
            self.event_timers[id_key] = (TaskActionTimer(
                CustomTaskEventHandlerContext(
                    key1,
                    self.HANDLER_CUSTOM,
                    cmd,
                ), retry_delays))
Esempio n. 28
0
    def _setup_custom_event_handlers(self, itask, event, message):
        """Set up custom task event handlers."""
        handlers = self._get_events_conf(itask, event + ' handler')
        if (handlers is None and
                event in self._get_events_conf(itask, 'handler events', [])):
            handlers = self._get_events_conf(itask, 'handlers')
        if handlers is None:
            return
        retry_delays = self._get_events_conf(
            itask,
            'handler retry delays',
            self.get_host_conf(itask, "task event handler retry delays"))
        if not retry_delays:
            retry_delays = [0]
        # There can be multiple custom event handlers
        for i, handler in enumerate(handlers):
            if event in self.NON_UNIQUE_EVENTS:
                key1 = (
                    '%s-%02d' % (self.HANDLER_CUSTOM, i),
                    '%s-%d' % (event, itask.non_unique_events.get(event, 1)))
            else:
                key1 = ('%s-%02d' % (self.HANDLER_CUSTOM, i), event)
            id_key = (
                key1, str(itask.point), itask.tdef.name, itask.submit_num)
            if id_key in self.event_timers:
                continue
            # Note: user@host may not always be set for a submit number, e.g.
            # on late event or if host select command fails. Use null string to
            # prevent issues in this case.
            user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
            if user_at_host and '@' not in user_at_host:
                # (only has 'user@' on the front if user is not suite owner).
                user_at_host = '%s@%s' % (get_user(), user_at_host)
            # Custom event handler can be a command template string
            # or a command that takes 4 arguments (classic interface)
            # Note quote() fails on None, need str(None).
            try:
                handler_data = {
                    "event": quote(event),
                    "suite": quote(self.suite),
                    'suite_uuid': quote(str(self.uuid_str)),
                    "point": quote(str(itask.point)),
                    "name": quote(itask.tdef.name),
                    "submit_num": itask.submit_num,
                    "try_num": itask.get_try_num(),
                    "id": quote(itask.identity),
                    "message": quote(message),
                    "batch_sys_name": quote(
                        str(itask.summary['batch_sys_name'])),
                    "batch_sys_job_id": quote(
                        str(itask.summary['submit_method_id'])),
                    "submit_time": quote(
                        str(itask.summary['submitted_time_string'])),
                    "start_time": quote(
                        str(itask.summary['started_time_string'])),
                    "finish_time": quote(
                        str(itask.summary['finished_time_string'])),
                    "user@host": quote(user_at_host)
                }

                if self.suite_cfg:
                    for key, value in self.suite_cfg.items():
                        if key == "URL":
                            handler_data["suite_url"] = quote(value)
                        else:
                            handler_data["suite_" + key] = quote(value)

                if itask.tdef.rtconfig['meta']:
                    for key, value in itask.tdef.rtconfig['meta'].items():
                        if key == "URL":
                            handler_data["task_url"] = quote(value)
                        handler_data[key] = quote(value)

                cmd = handler % (handler_data)
            except KeyError as exc:
                message = "%s/%s/%02d %s bad template: %s" % (
                    itask.point, itask.tdef.name, itask.submit_num, key1, exc)
                LOG.error(message)
                continue

            if cmd == handler:
                # Nothing substituted, assume classic interface
                cmd = "%s '%s' '%s' '%s' '%s'" % (
                    handler, event, self.suite, itask.identity, message)
            LOG.debug("[%s] -Queueing %s handler: %s", itask, event, cmd)
            self.event_timers[id_key] = (
                TaskActionTimer(
                    CustomTaskEventHandlerContext(
                        key1,
                        self.HANDLER_CUSTOM,
                        cmd,
                    ),
                    retry_delays))