def list_suites(self, regfilter=None):
    """Return a filtered list of valid suite registrations.

    Args:
        regfilter (str): Optional regular expression; only registrations
            whose (relative) name matches via ``re.search`` are returned.

    Returns:
        list: ``[reg, source_dir, title]`` for each matching suite.

    Raises:
        ValueError: If ``regfilter`` is not a valid regular expression.
    """
    rec_regfilter = None
    if regfilter:
        try:
            rec_regfilter = re.compile(regfilter)
        except re.error as exc:
            raise ValueError("%s: %s" % (regfilter, exc))
    run_d = glbl_cfg().get_host_item('run directory')
    results = []
    for dirpath, dnames, _ in os.walk(run_d, followlinks=True):
        # Always descend for top directory, but
        # don't descend further if it has a .service/ dir
        # (pruning in-place via dnames[:] is the os.walk idiom)
        if dirpath != run_d and self.DIR_BASE_SRV in dnames:
            dnames[:] = []
        # Choose only suites with .service and matching filter
        reg = os.path.relpath(dirpath, run_d)
        path = os.path.join(dirpath, self.DIR_BASE_SRV)
        if (not self._locate_item(self.FILE_BASE_SOURCE, path) or
                rec_regfilter and not rec_regfilter.search(reg)):
            continue
        try:
            results.append([
                reg,
                self.get_suite_source_dir(reg),
                self.get_suite_title(reg)])
        except (IOError, SuiteServiceFileError) as exc:
            # Unreadable/broken service files: log and skip this suite.
            LOG.error('%s: %s', reg, exc)
    return results
def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
    """Poll jobs of specified tasks.

    Any job that is or was submitted or running can be polled, except for
    retrying tasks - which would poll (correctly) as failed. And don't poll
    succeeded tasks by default.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as help/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    # Statuses whose jobs may safely be polled.
    pollable_statuses = {
        TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED}
    if poll_succ:
        pollable_statuses.add(TASK_STATUS_SUCCEEDED)
    to_poll_tasks = []
    for itask in itasks:
        if itask.state.status not in pollable_statuses:
            LOG.debug(
                "skipping %s: not pollable, "
                "or skipping 'succeeded' tasks" % itask.identity)
            continue
        to_poll_tasks.append(itask)
    if not to_poll_tasks:
        return
    if msg is not None:
        LOG.info(msg)
    self._run_job_cmd(
        self.JOBS_POLL, suite, to_poll_tasks,
        self._poll_task_jobs_callback)
def addict(cfig, key, val, parents, index):
    """Add a new [parents...]key=value pair to a nested dict.

    Args:
        cfig (dict): Nested config dict, updated in place.
        key (str): Item name.
        val: Item value.
        parents (list): Section names leading down to ``key``.
        index (int): Source line number, for error reporting.

    Raises:
        FileParseError: If a non-dict item of this name already exists at
            this level of the nesting.
    """
    for p in parents:
        # drop down the parent list
        cfig = cfig[p]

    if not isinstance(cfig, dict):
        # An item of this name has already been encountered at this level.
        # Format the message here: passing the template and args through as
        # separate positional arguments would leave "%d"/"%s" unfilled in
        # the exception text.
        raise FileParseError(
            'line %d: already encountered %s' % (
                index, itemstr(parents, key, val)))

    if key in cfig:
        # this item already exists
        if (key == 'graph' and (
                parents == ['scheduling', 'dependencies'] or
                len(parents) == 3 and
                parents[-3:-1] == ['scheduling', 'dependencies'])):
            # append the new graph string to the existing one
            LOG.debug('Merging graph strings under %s', itemstr(parents))
            if not isinstance(cfig[key], list):
                cfig[key] = [cfig[key]]
            cfig[key].append(val)
        else:
            # otherwise override the existing item
            LOG.debug(
                'overriding %s old value: %s new value: %s',
                itemstr(parents, key), cfig[key], val)
            cfig[key] = val
    else:
        cfig[key] = val
def _job_cmd_out_callback(suite, itask, cmd_ctx, line):
    """Callback on job command STDOUT/STDERR."""
    kwargs = cmd_ctx.cmd_kwargs
    if kwargs.get("host") and kwargs.get("user"):
        owner_at_host = "(%(user)s@%(host)s) " % kwargs
    elif kwargs.get("host"):
        owner_at_host = "(%(host)s) " % kwargs
    elif kwargs.get("user"):
        owner_at_host = "(%(user)s@localhost) " % kwargs
    else:
        owner_at_host = ""
    # Lines are expected as "timestamp|job-id|content"; leave anything
    # else untouched.
    try:
        timestamp, _, content = line.split("|")
    except ValueError:
        pass
    else:
        line = "%s %s" % (timestamp, content)
    job_activity_log = get_task_job_activity_log(
        suite, itask.point, itask.tdef.name)
    try:
        with open(job_activity_log, "ab") as handle:
            if not line.endswith("\n"):
                line += "\n"
            handle.write((owner_at_host + line).encode())
    except IOError as exc:
        LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
    LOG.warning("[%s] -%s%s", itask, owner_at_host, line)
def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _kill_task_jobs_callback, on one task job.

    Parse one "timestamp|job-id|return-code" output line from the job-kill
    command, log the activity, and update the task's state accordingly.
    """
    ctx = SubProcContext(self.JOBS_KILL, None)
    ctx.out = line
    try:
        # "timestamp|job-id|ret-code" - maxsplit=2 keeps any further "|"
        # characters inside the return-code field.
        ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
    except ValueError:
        # Malformed line: treat as a kill failure.
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
    log_lvl = INFO
    log_msg = 'killed'
    if ctx.ret_code:  # non-zero exit status
        log_lvl = WARNING
        log_msg = 'kill failed'
        itask.state.kill_failed = True
    elif itask.state.status == TASK_STATUS_SUBMITTED:
        # Successful kill of a submitted job: report it as submit-failed.
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            ctx.timestamp)
    elif itask.state.status == TASK_STATUS_RUNNING:
        # Successful kill of a running job: report the job as failed.
        self.task_events_mgr.process_message(
            itask, CRITICAL, TASK_OUTPUT_FAILED)
    else:
        log_lvl = DEBUG
        log_msg = (
            'ignoring job kill result, unexpected task state: %s' %
            itask.state.status)
    itask.set_summary_message(log_msg)
    LOG.log(log_lvl, "[%s] -job(%02d) %s" % (
        itask.identity, itask.submit_num, log_msg))
def _receiver(self, message): """Wrap incoming messages and dispatch them to exposed methods. Args: message (dict): message contents """ # determine the server method to call try: method = getattr(self, message['command']) args = message['args'] args.update({'user': message['user']}) if 'meta' in message: args['meta'] = message['meta'] except KeyError: # malformed message return {'error': { 'message': 'Request missing required field(s).'}} except AttributeError: # no exposed method by that name return {'error': { 'message': 'No method by the name "%s"' % message['command']}} # generate response try: response = method(**args) except Exception as exc: # includes incorrect arguments (TypeError) LOG.exception(exc) # note the error server side import traceback return {'error': { 'message': str(exc), 'traceback': traceback.format_exc()}} return {'data': response}
def stop(self):
    """Finish serving the current request then stop the server.

    Blocks until the listener thread has returned and the socket is
    closed.
    """
    LOG.debug('stopping zmq server...')
    # Sentinel item tells the listener loop to exit.
    self.queue.put('STOP')
    self.thread.join()  # wait for the listener to return
    self.socket.close()
    LOG.debug('...stopped')
def remote_tidy(self): """Remove suite contact files from initialised remotes. Call "cylc remote-tidy". This method is called on suite shutdown, so we want nothing to hang. Timeout any incomplete commands after 10 seconds. Also remove UUID file on suite host ".service/uuid". """ # Remove UUID file uuid_fname = os.path.join( self.suite_srv_files_mgr.get_suite_srv_dir(self.suite), FILE_BASE_UUID) try: os.unlink(uuid_fname) except OSError: pass # Issue all SSH commands in parallel procs = {} for (host, owner), init_with_contact in self.remote_init_map.items(): if init_with_contact != REMOTE_INIT_DONE: continue cmd = ['timeout', '10', 'cylc', 'remote-tidy'] if is_remote_host(host): cmd.append('--host=%s' % host) if is_remote_user(owner): cmd.append('--user=%s' % owner) if cylc.flow.flags.debug: cmd.append('--debug') cmd.append(os.path.join(glbl_cfg().get_derived_host_item( self.suite, 'suite run directory', host, owner))) procs[(host, owner)] = ( cmd, Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=open(os.devnull))) # Wait for commands to complete for a max of 10 seconds timeout = time() + 10.0 while procs and time() < timeout: for (host, owner), (cmd, proc) in procs.copy().items(): if proc.poll() is None: continue del procs[(host, owner)] out, err = (f.decode() for f in proc.communicate()) if proc.wait(): LOG.warning(TaskRemoteMgmtError( TaskRemoteMgmtError.MSG_TIDY, (host, owner), ' '.join(quote(item) for item in cmd), proc.returncode, out, err)) # Terminate any remaining commands for (host, owner), (cmd, proc) in procs.items(): try: proc.terminate() except OSError: pass out, err = proc.communicate() if proc.wait(): LOG.warning(TaskRemoteMgmtError( TaskRemoteMgmtError.MSG_TIDY, (host, owner), ' '.join(quote(item) for item in cmd), proc.returncode, out, err))
def create_directory(dir_, name):
    """Create directory. Raise GlobalConfigError on error.

    Args:
        dir_ (str): Path of the directory to create (parents included).
        name (str): Human-readable name used in the error message.
    """
    try:
        # exist_ok: an already-present directory is not an error.
        os.makedirs(dir_, exist_ok=True)
    except OSError as exc:
        LOG.exception(exc)
        raise GlobalConfigError(
            'Failed to create directory "%s"' % name)
def recover_pub_from_pri(self):
    """Recover public database from private database."""
    pub = self.pub_dao
    if pub.n_tries < pub.MAX_TRIES:
        return
    # Too many failed write attempts: rebuild the public DB from the
    # private one and reset the failure counter.
    self.copy_pri_to_pub()
    LOG.warning(
        "%(pub_db_name)s: recovered from %(pri_db_name)s" % {
            "pub_db_name": pub.db_file_name,
            "pri_db_name": self.pri_dao.db_file_name})
    pub.n_tries = 0
def clear_broadcast( self, point_strings=None, namespaces=None, cancel_settings=None): """Clear broadcasts globally, or for listed namespaces and/or points. Return a tuple (modified_settings, bad_options), where: * modified_settings is similar to the return value of the "put" method, but for removed broadcasts. * bad_options is a dict in the form: {"point_strings": ["20020202", ..."], ...} The dict is only populated if there are options not associated with previous broadcasts. The keys can be: * point_strings: a list of bad point strings. * namespaces: a list of bad namespaces. * cancel: a list of tuples. Each tuple contains the keys of a bad setting. """ # If cancel_settings defined, only clear specific broadcasts cancel_keys_list = self._settings_to_keys_list(cancel_settings) # Clear broadcasts modified_settings = [] with self.lock: for point_string, point_string_settings in self.broadcasts.items(): if point_strings and point_string not in point_strings: continue for namespace, namespace_settings in ( point_string_settings.items()): if namespaces and namespace not in namespaces: continue stuff_stack = [([], namespace_settings)] while stuff_stack: keys, stuff = stuff_stack.pop() for key, value in stuff.items(): if isinstance(value, dict): stuff_stack.append((keys + [key], value)) elif (not cancel_keys_list or keys + [key] in cancel_keys_list): stuff[key] = None setting = {key: value} for rkey in reversed(keys): setting = {rkey: setting} modified_settings.append( (point_string, namespace, setting)) # Prune any empty branches bad_options = self._get_bad_options( self._prune(), point_strings, namespaces, cancel_keys_list) # Log the broadcast self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True) LOG.info( get_broadcast_change_report(modified_settings, is_cancel=True)) if bad_options: LOG.error(get_broadcast_bad_options_report(bad_options)) return modified_settings, bad_options
def addsect(cfig, sname, parents):
    """Add a new section to a nested dict.

    Args:
        cfig (dict): Nested config dict, updated in place.
        sname (str): Name of the new section.
        parents (list): Section names leading down to ``sname``.
    """
    for parent in parents:
        # drop down the parent list
        cfig = cfig[parent]
    if sname not in cfig:
        cfig[sname] = OrderedDictWithDefaults()
    else:
        # this doesn't warrant a warning unless contained items are
        # repeated
        LOG.debug(
            'Section already encountered: %s', itemstr(parents + [sname]))
def _run_event_custom_handlers(self, config, ctx): """Helper for "run_event_handlers", custom event handlers.""" # Look for event handlers # 1. Handlers for specific event # 2. General handlers handlers = self.get_events_conf(config, '%s handler' % ctx.event) if not handlers and ( ctx.event in self.get_events_conf(config, 'handler events', [])): handlers = self.get_events_conf(config, 'handlers') if not handlers: return for i, handler in enumerate(handlers): cmd_key = ('%s-%02d' % (self.SUITE_EVENT_HANDLER, i), ctx.event) # Handler command may be a string for substitution abort_on_error = self.get_events_conf( config, 'abort if %s handler fails' % ctx.event) try: handler_data = { 'event': quote(ctx.event), 'message': quote(ctx.reason), 'suite': quote(ctx.suite), 'suite_uuid': quote(str(ctx.uuid_str)), } if config.cfg['meta']: for key, value in config.cfg['meta'].items(): if key == "URL": handler_data["suite_url"] = quote(value) handler_data[key] = quote(value) cmd = handler % (handler_data) except KeyError as exc: message = "%s bad template: %s" % (cmd_key, exc) LOG.error(message) if abort_on_error: raise SuiteEventError(message) continue if cmd == handler: # Nothing substituted, assume classic interface cmd = "%s '%s' '%s' '%s'" % ( handler, ctx.event, ctx.suite, ctx.reason) proc_ctx = SubProcContext( cmd_key, cmd, env=dict(os.environ), shell=True) if abort_on_error or self.proc_pool.closed: # Run command in foreground if abort on failure is set or if # process pool is closed self.proc_pool.run_command(proc_ctx) self._run_event_handlers_callback( proc_ctx, abort_on_error=abort_on_error) else: # Run command using process pool otherwise self.proc_pool.put_command( proc_ctx, self._run_event_handlers_callback)
def execute_queued_items(self):
    """Execute queued items for each table.

    On success the queues are cleared; on sqlite3 failure a public
    database tolerates the error (with a retry counter) while a private
    database re-raises.
    """
    try:
        for table in self.tables.values():
            # DELETE statements may have varying number of WHERE args so
            # we can only executemany for each identical template
            # statement.
            for stmt, stmt_args_list in table.delete_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
            # INSERT statements are uniform for each table, so all INSERT
            # statements can be executed using a single "executemany"
            # call.
            if table.insert_queue:
                self._execute_stmt(
                    table.get_insert_stmt(), table.insert_queue)
            # UPDATE statements can have varying number of SET and WHERE
            # args so we can only executemany for each identical template
            # statement.
            for stmt, stmt_args_list in table.update_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
        # Connection should only be opened if we have executed something.
        if self.conn is None:
            return
        self.conn.commit()
    except sqlite3.Error:
        if not self.is_public:
            # Private DB: failure is fatal.
            raise
        self.n_tries += 1
        LOG.warning(
            "%(file)s: write attempt (%(attempt)d) did not complete\n" % {
                "file": self.db_file_name, "attempt": self.n_tries})
        if self.conn is not None:
            try:
                self.conn.rollback()
            except sqlite3.Error:
                pass
        return
    else:
        # Clear the queues
        for table in self.tables.values():
            table.delete_queues.clear()
            del table.insert_queue[:]  # list.clear avail from Python 3.3
            table.update_queues.clear()
        # Report public database retry recovery if necessary
        if self.n_tries:
            LOG.warning(
                "%(file)s: recovered after (%(attempt)d) attempt(s)\n" % {
                    "file": self.db_file_name, "attempt": self.n_tries})
            self.n_tries = 0
    finally:
        # Note: This is not strictly necessary. However, if the suite run
        # directory is removed, a forced reconnection to the private
        # database will ensure that the suite dies.
        self.close()
def _remote_host_select_callback(self, proc_ctx, cmd_str):
    """Callback when host select command exits.

    Record the selected host (first line of output) on success, or a
    TaskRemoteMgmtError on failure, keyed by the command string.
    """
    self.ready = True
    success = proc_ctx.ret_code == 0 and proc_ctx.out
    if success:
        LOG.debug(proc_ctx)
        # First line of output is the selected host name.
        self.remote_host_str_map[cmd_str] = proc_ctx.out.splitlines()[0]
        return
    LOG.error(proc_ctx)
    self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
        TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
        proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)
def _remote_init_callback(
        self, proc_ctx, platform, tmphandle, curve_auth,
        client_pub_key_dir):
    """Callback when "cylc remote-init" exits.

    Write public key for install target into client public key
    directory.
    Set remote_init__map status to REMOTE_INIT_DONE on success which
    in turn will trigger file installation to start.
    Set remote_init_map status to REMOTE_INIT_FAILED on error.
    """
    with suppress(OSError):  # E.g. ignore bad unlink, etc
        tmphandle.close()
    install_target = platform['install target']
    if proc_ctx.ret_code == 0 and "KEYSTART" in proc_ctx.out:
        # Extract the key text between the KEYSTART/KEYEND markers.
        regex_result = re.search(
            'KEYSTART((.|\n|\r)*)KEYEND', proc_ctx.out)
        key = regex_result.group(1)
        workflow_srv_dir = get_workflow_srv_dir(self.workflow)
        public_key = KeyInfo(
            KeyType.PUBLIC,
            KeyOwner.CLIENT,
            workflow_srv_dir=workflow_srv_dir,
            install_target=install_target)
        # Restrict key file permissions to owner read/write (0o600).
        old_umask = os.umask(0o177)
        with open(
                public_key.full_key_path, 'w',
                encoding='utf8') as text_file:
            text_file.write(key)
        os.umask(old_umask)
        # configure_curve must be called every time certificates are
        # added or removed, in order to update the Authenticator's
        # state.
        curve_auth.configure_curve(
            domain='*', location=(client_pub_key_dir))
        self.remote_init_map[install_target] = REMOTE_INIT_DONE
        self.ready = True
        return
    # Bad status
    LOG.error(
        PlatformError(
            PlatformError.MSG_INIT,
            platform['name'],
            cmd=proc_ctx.cmd,
            ret_code=proc_ctx.ret_code,
            out=proc_ctx.out,
            err=proc_ctx.err,
        ))
    self.remote_init_map[platform['install target']] = REMOTE_INIT_FAILED
    self.ready = True
def load(config, additional_plugins=None):
    """Load and index the configured main-loop plugins.

    Args:
        config (dict): Main loop configuration, with a 'plugins' name list.
        additional_plugins (list): Extra plugin names to load as well.

    Returns:
        dict: Registry with 'state', 'timings' and 'config' entries plus
        one entry per main-loop coroutine type, mapping
        (plugin_name, coro_name) to the coroutine.

    Raises:
        UserInputError: If a named plugin has no matching entry point.
        CylcError: If a plugin module fails to import.
    """
    additional_plugins = additional_plugins or []
    entry_points = {
        entry_point.name: entry_point
        for entry_point in iter_entry_points('cylc.main_loop')
    }
    plugins = {
        'state': {},
        'timings': {}
    }
    for plugin_name in config['plugins'] + additional_plugins:
        # get plugin
        try:
            module_name = entry_points[plugin_name.replace(' ', '_')]
        except KeyError:
            raise UserInputError(
                f'No main-loop plugin: "{plugin_name}"\n'
                + ' Available plugins:\n'
                + indent('\n'.join(sorted(entry_points)), ' ')
            )
        # load plugin
        try:
            module = module_name.load()
        except Exception as exc:
            # chain the cause so the underlying import error is visible
            raise CylcError(
                f'Could not load plugin: "{plugin_name}"') from exc
        # load coroutines
        log = []
        for coro_name, coro in (
                (coro_name, coro)
                for coro_name, coro in getmembers(module)
                if isfunction(coro)
                if hasattr(coro, 'main_loop')
        ):
            log.append(coro_name)
            plugins.setdefault(
                coro.main_loop, {}
            )[(plugin_name, coro_name)] = coro
            plugins['timings'][(plugin_name, coro_name)] = deque(maxlen=1)
        # newline placed in the format string, not appended to the quoted
        # plugin name (which garbled the log message)
        LOG.debug(
            'Loaded main loop plugin "%s":\n%s',
            plugin_name,
            '\n'.join((f'* {x}' for x in log))
        )
        # set the initial state of the plugin
        plugins['state'][plugin_name] = {}
    # make a note of the config here for ease of reference
    plugins['config'] = config
    return plugins
async def async_request(self, command, args=None, timeout=None):
    """Send an asynchronous request using asyncio.

    Has the same arguments and return values as ``serial_request``.

    Args:
        command (str): Server method name to invoke.
        args (dict): Arguments for the method.
        timeout (float): Poll timeout in seconds (converted to ms here);
            falls back to ``self.timeout`` when not given.
    """
    if timeout:
        timeout = float(timeout)
    # zmq poller timeouts are in milliseconds.
    timeout = (timeout * 1000 if timeout else None) or self.timeout
    if not args:
        args = {}
    # get secret for this request
    # assumes secret won't change during the request
    try:
        secret = self.secret()
    except cylc.flow.suite_srv_files_mgr.SuiteServiceFileError:
        raise ClientError('could not read suite passphrase')
    # send message
    msg = {'command': command, 'args': args}
    msg.update(self.header)
    LOG.debug('zmq:send %s' % msg)
    message = encrypt(msg, secret)
    self.socket.send_string(message)
    # receive response
    if self.poller.poll(timeout):
        res = await self.socket.recv()
    else:
        # No reply within the timeout window.
        if self.timeout_handler:
            self.timeout_handler()
        raise ClientTimeout('Timeout waiting for server response.')
    if msg['command'] in PB_METHOD_MAP:
        # Protobuf responses are returned raw, not decrypted here.
        response = {'data': res}
    else:
        try:
            response = decrypt(res.decode(), secret)
        except jose.exceptions.JWTError:
            raise ClientError(
                'Could not decrypt response. Has the passphrase changed?')
    LOG.debug('zmq:recv %s' % response)
    try:
        return response['data']
    except KeyError:
        # Server replied with an error payload instead of data.
        error = response['error']
        raise ClientError(error['message'], error.get('traceback'))
def test_value_error_raises_system_exit(self, mocked_glbl_cfg):
    """Test that a ValueError when writing to a log stream won't result
    in multiple exceptions (what could lead to infinite loop in some
    occasions. Instead, it **must** raise a SystemExit"""
    with tempfile.NamedTemporaryFile() as tf:
        # mock objects used when creating the file handler
        mocked = mock.MagicMock()
        mocked_glbl_cfg.return_value = mocked
        mocked.get_derived_host_item.return_value = tf.name
        mocked.get.return_value = 100
        file_handler = TimestampRotatingFileHandler("suiteA", False)
        # next line is important as pytest can have a "Bad file
        # descriptor" due to a FileHandler with default "a"
        # (pytest tries to r/w).
        file_handler.mode = "a+"
        # enable the logger
        LOG.setLevel(logging.INFO)
        LOG.addHandler(file_handler)
        # Disable raising uncaught exceptions in logging, due to file
        # handler using stdin.fileno. See the following links for more.
        # https://github.com/pytest-dev/pytest/issues/2276 &
        # https://github.com/pytest-dev/pytest/issues/1585
        logging.raiseExceptions = False
        # first message will initialize the stream and the handler
        LOG.info("What could go")
        # here we change the stream of the handler
        old_stream = file_handler.stream
        file_handler.stream = mock.MagicMock()
        file_handler.stream.seek = mock.MagicMock()
        # in case where
        file_handler.stream.seek.side_effect = ValueError
        try:
            # next call will call the emit method and use the mocked
            # stream
            # NOTE(review): the failure message below says "SystemError"
            # but the exception under test is SystemExit.
            LOG.info("wrong?!")
            self.fail("Exception SystemError was not raised")
        except SystemExit:
            pass
        finally:
            # clean up
            file_handler.stream = old_stream
            # for log_handler in LOG.handlers:
            #     log_handler.close()
            file_handler.close()
            LOG.removeHandler(file_handler)
            logging.raiseExceptions = True
def call_xtriggers_async(self, itask: TaskProxy):
    """Call itask's xtrigger functions via the process pool...

    ...if previous call not still in-process and retry period is up.

    Args:
        itask: task proxy to check.
    """
    for label, sig, ctx, _ in self._get_xtrigs(itask, unsat_only=True):
        if sig.startswith("wall_clock"):
            # Special case: quick synchronous clock check.
            if 'absolute_as_seconds' not in ctx.func_kwargs:
                ctx.func_kwargs.update(
                    {'point_as_seconds': itask.get_point_as_seconds()})
            if wall_clock(*ctx.func_args, **ctx.func_kwargs):
                itask.state.xtriggers[label] = True
                self.sat_xtrig[sig] = {}
                self.data_store_mgr.delta_task_xtrigger(sig, True)
                LOG.info('xtrigger satisfied: %s = %s', label, sig)
            continue
        # General case: potentially slow asynchronous function call.
        if sig in self.sat_xtrig:
            # Already satisfied (possibly by another task sharing the
            # same signature): just mark it and broadcast the results.
            if not itask.state.xtriggers[label]:
                itask.state.xtriggers[label] = True
                res = {}
                for key, val in self.sat_xtrig[sig].items():
                    res["%s_%s" % (label, key)] = val
                if res:
                    # Make function results available to the task's jobs
                    # via environment broadcast.
                    xtrigger_env = [{
                        'environment': {key: val}
                    } for key, val in res.items()]
                    self.broadcast_mgr.put_broadcast(
                        [str(itask.point)],
                        [itask.tdef.name],
                        xtrigger_env)
            continue
        if sig in self.active:
            # Already waiting on this result.
            continue
        now = time()
        if sig in self.t_next_call and now < self.t_next_call[sig]:
            # Too soon to call this one again.
            continue
        self.t_next_call[sig] = now + ctx.intvl
        # Queue to the process pool, and record as active.
        self.active.append(sig)
        self.proc_pool.put_command(ctx, callback=self.callback)
def stop(self, stop_loop=True):
    """Stop the server.

    Args:
        stop_loop (Boolean): Stop running IOLoop of current thread.
    """
    self._bespoke_stop()
    # Each teardown step is guarded so a partially-started server can
    # still be stopped safely.
    if stop_loop and self.loop and self.loop.is_running():
        self.loop.stop()
    if self.thread and self.thread.is_alive():
        self.thread.join()  # Wait for processes to return
    if self.socket and not self.socket.closed:
        self.socket.close()
    LOG.debug('...stopped')
def run_reftest(config, ctx):
    """Run reference test at shutdown.

    Compare this run's triggering log against the reference log and raise
    WorkflowEventError on mismatch.
    """
    ref = _load_reflog(config.get_ref_log_name())
    cur = _load_reflog(get_workflow_test_log_name(ctx.workflow))
    if ref == cur:
        LOG.info('WORKFLOW REFERENCE TEST PASSED')
        return
    # Mismatch: report a unified diff of reference vs this run.
    exc = WorkflowEventError(
        'WORKFLOW REFERENCE TEST FAILED\n'
        'triggering is NOT consistent with the reference log:\n%s\n'
        % '\n'.join(unified_diff(ref, cur, 'reference', 'this run')))
    LOG.exception(exc)
    raise exc
def _event_email_callback(self, proc_ctx, schd_ctx):
    """Call back when email notification command exits.

    On success, drop each notified event timer and log the activity; on
    failure, re-arm the timers for retry.
    """
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            if proc_ctx.ret_code != 0:
                # Send failed: allow the timer to fire again.
                self.event_timers[id_key].unset_waiting()
                continue
            del self.event_timers[id_key]
            log_ctx = SubProcContext((key1, submit_num), None)
            log_ctx.ret_code = 0
            log_task_job_activity(
                log_ctx, schd_ctx.suite, point, name, submit_num)
        except KeyError as exc:
            LOG.exception(exc)
def _open_logs(id_, no_detach):
    """Open Cylc log handlers for a flow run.

    Args:
        id_ (str): Workflow id used to derive the log file paths.
        no_detach (bool): Whether the scheduler stays attached; when
            detaching, existing handlers are closed and removed first.
    """
    if not no_detach:
        # Drop any existing handlers before switching to file logging.
        while LOG.handlers:
            handler = LOG.handlers[0]
            handler.close()
            LOG.removeHandler(handler)
    LOG.addHandler(
        TimestampRotatingFileHandler(
            get_workflow_run_log_name(id_), no_detach))
    # Add file installation log
    RSYNC_LOG.addHandler(
        TimestampRotatingFileHandler(
            get_workflow_file_install_log_name(id_), no_detach))
def kill_task_jobs(self, suite, itasks):
    """Kill jobs of active tasks, and hold the tasks.

    If items is specified, kill active tasks matching given IDs.
    """
    to_kill_tasks = []
    for itask in itasks:
        if not itask.state(*TASK_STATUSES_ACTIVE):
            LOG.warning('skipping %s: task not killable' % itask.identity)
            continue
        # Hold the task so it does not retry after the kill.
        itask.state.reset(is_held=True)
        to_kill_tasks.append(itask)
    self._run_job_cmd(
        self.JOBS_KILL, suite, to_kill_tasks,
        self._kill_task_jobs_callback)
def check_task_jobs(self, suite, task_pool):
    """Check submission and execution timeout and polling timers.

    Poll tasks that have timed out and/or have reached next polling
    time.
    """
    now = time()
    poll_tasks = set()
    for itask in task_pool.get_tasks():
        if not self.task_events_mgr.check_job_time(itask, now):
            continue
        poll_tasks.add(itask)
        if itask.poll_timer.delay is not None:
            LOG.info(
                '[%s] -poll now, (next in %s)',
                itask, itask.poll_timer.delay_timeout_as_str())
    if poll_tasks:
        self.poll_task_jobs(suite, poll_tasks)
def _authorise(self, *args, user='******', meta=None, **kwargs):
    """Check the client's privilege level before invoking the method.

    Raises:
        Exception: If the user's privilege is below ``req_priv_level``.
    """
    if not meta:
        meta = {}
    host = meta.get('host', '?')
    prog = meta.get('prog', '?')
    usr_priv_level = self._get_priv_level(user)
    if usr_priv_level < req_priv_level:
        # Logger.warn is deprecated - use warning()
        LOG.warning(
            "[client-connect] DENIED (privilege '%s' < '%s') %s@%s:%s",
            usr_priv_level, req_priv_level, user, host, prog)
        raise Exception('Authorisation failure')
    LOG.info(
        '[client-command] %s %s@%s:%s', fcn.__name__, user, host, prog)
    return fcn(self, *args, **kwargs)
def _authorise(self, *args, user='******', meta=None, **kwargs):
    """Check the client's privilege level before invoking the method.

    Raises:
        Exception: If the user's privilege is below ``req_priv_level``.
    """
    if not meta:
        meta = {}
    host = meta.get('host', '?')
    prog = meta.get('prog', '?')
    usr_priv_level = self._get_priv_level(user)
    if usr_priv_level < req_priv_level:
        # Logger.warn is deprecated - use warning()
        LOG.warning(
            "[client-connect] DENIED (privilege '%s' < '%s') %s@%s:%s",
            usr_priv_level, req_priv_level, user, host, prog)
        raise Exception('Authorisation failure')
    LOG.info(
        '[client-command] %s %s@%s:%s', fcn.__name__, user, host, prog)
    return fcn(self, *args, **kwargs)
def process_mail_footer(
    mail_footer_tmpl: str,
    template_vars,
) -> str:
    """Process mail footer for workflow or task events.

    Returns an empty string if issues occur in processing.
    """
    footer = ''
    try:
        # %-substitute the template, terminated with a newline.
        footer = (mail_footer_tmpl + '\n') % template_vars
    except (KeyError, ValueError):
        # Unknown key or malformed "%" placeholder in the template.
        LOG.warning(
            f'Ignoring bad mail footer template: {mail_footer_tmpl}'
        )
    return footer
def upgrade_retry_state(self):
    """Replace the retry state with xtriggers.

    * Change *retrying tasks to waiting
    * Add the required xtrigger

    Note: The retry status can be safely removed as this is really a
    display state, the retry logic revolves around the TaskActionTimer.

    From:
        cylc<8
    To:
        cylc>=8
    PR:
        #3423

    Returns:
        list - (cycle, name, status) tuples of all retrying tasks.
        NOTE(review): only tasks from the last table iterated are
        returned - confirm this is intended.
    """
    conn = self.connect()
    for table in [self.TABLE_TASK_POOL_CHECKPOINTS, self.TABLE_TASK_POOL]:
        # Collect the retrying tasks in this table.
        tasks = list(
            conn.execute(rf'''
                SELECT
                    cycle, name, status
                FROM
                    {table}
                WHERE
                    status IN ('retrying', 'submit-retrying')
            '''))
        if tasks:
            LOG.info(f'Upgrade retrying tasks in table {table}')
            # Flip each retrying task to the waiting state.
            conn.executemany(
                rf'''
                    UPDATE
                        {table}
                    SET
                        status='{TASK_STATUS_WAITING}'
                    WHERE
                        cycle==? and name==? and status==?
                ''',
                tasks)
            conn.commit()
    return tasks
def parse_args(self, api_args, remove_opts=None):
    """Parse options and arguments, overrides OptionParser.parse_args.

    Args:
        api_args (list):
            Command line options if passed via Python as opposed to
            sys.argv
        remove_opts (list):
            List of standard options to remove before parsing.

    Returns:
        tuple: (options, args) as from OptionParser.parse_args.
    """
    if self.auto_add:
        # Add common options after command-specific options.
        self.add_std_options()
    if remove_opts:
        for opt in remove_opts:
            try:
                self.remove_option(opt)
            except ValueError:
                # Option not present - nothing to remove.
                pass
    (options, args) = OptionParser.parse_args(self, api_args)
    # Enforce the declared positional-argument arity.
    if len(args) < self.n_compulsory_args:
        self.error("Wrong number of arguments (too few)")
    elif not self.unlimited_args and \
            len(args) > self.n_compulsory_args + self.n_optional_args:
        self.error("Wrong number of arguments (too many)")
    if self.jset:
        # Normalise the template variables file path.
        if options.templatevars_file:
            options.templatevars_file = os.path.abspath(os.path.expanduser(
                options.templatevars_file))
    cylc.flow.flags.verbosity = options.verbosity
    # Set up stream logging for CLI. Note:
    # 1. On choosing STDERR: Log messages are diagnostics, so STDERR is
    #    the better choice for the logging stream. This allows us to use
    #    STDOUT for verbosity agnostic outputs.
    # 2. Scheduler will remove this handler when it becomes a daemon.
    if options.verbosity > 1:
        LOG.setLevel(logging.DEBUG)
    else:
        LOG.setLevel(logging.INFO)
    # Remove NullHandler before add the StreamHandler
    RSYNC_LOG.setLevel(logging.INFO)
    while LOG.handlers:
        LOG.handlers[0].close()
        LOG.removeHandler(LOG.handlers[0])
    errhandler = logging.StreamHandler(sys.stderr)
    errhandler.setFormatter(CylcLogFormatter(
        timestamp=options.log_timestamp))
    LOG.addHandler(errhandler)
    return (options, args)
def test_value_error_raises_system_exit(mocked_glbl_cfg, ):
    """Test that a ValueError when writing to a log stream won't result
    in multiple exceptions (what could lead to infinite loop in some
    occasions. Instead, it **must** raise a SystemExit"""
    with tempfile.NamedTemporaryFile() as tf:
        # mock objects used when creating the file handler
        mocked = mock.MagicMock()
        mocked_glbl_cfg.return_value = mocked
        mocked.get.return_value = 100
        file_handler = TimestampRotatingFileHandler(tf.name, False)
        # next line is important as pytest can have a "Bad file
        # descriptor" due to a FileHandler with default "a"
        # (pytest tries to r/w).
        file_handler.mode = "a+"
        # enable the logger
        LOG.setLevel(logging.INFO)
        LOG.addHandler(file_handler)
        # Disable raising uncaught exceptions in logging, due to file
        # handler using stdin.fileno. See the following links for more.
        # https://github.com/pytest-dev/pytest/issues/2276 &
        # https://github.com/pytest-dev/pytest/issues/1585
        logging.raiseExceptions = False
        # first message will initialize the stream and the handler
        LOG.info("What could go")
        # here we change the stream of the handler
        old_stream = file_handler.stream
        file_handler.stream = mock.MagicMock()
        file_handler.stream.seek = mock.MagicMock()
        # in case where
        file_handler.stream.seek.side_effect = ValueError
        try:
            # next call will call the emit method and use the mocked
            # stream
            # NOTE(review): the failure message below says "SystemError"
            # but the exception under test is SystemExit.
            LOG.info("wrong?!")
            raise Exception("Exception SystemError was not raised")
        except SystemExit:
            pass
        finally:
            # clean up
            file_handler.stream = old_stream
            # for log_handler in LOG.handlers:
            #     log_handler.close()
            file_handler.close()
            LOG.removeHandler(file_handler)
            logging.raiseExceptions = True
def put_broadcast(
        self, point_strings=None, namespaces=None, settings=None):
    """Add new broadcast settings (server side interface).

    Return a tuple (modified_settings, bad_options) where:
      modified_settings is list of modified settings in the form:
        [("20200202", "foo", {"script": "true"}, ...]
      bad_options is as described in the docstring for self.clear().
    """
    modified_settings = []
    bad_point_strings = []
    bad_namespaces = []
    with self.lock:
        for setting in settings:
            for point_string in point_strings:
                # Standardise the point and check its validity.
                bad_point = False
                try:
                    point_string = standardise_point_string(point_string)
                except PointParsingError:
                    # "*" is the valid "all points" wildcard.
                    if point_string != '*':
                        bad_point_strings.append(point_string)
                        bad_point = True
                if not bad_point and point_string not in self.broadcasts:
                    self.broadcasts[point_string] = {}
                for namespace in namespaces:
                    if namespace not in self.linearized_ancestors:
                        bad_namespaces.append(namespace)
                    elif not bad_point:
                        # Merge the setting into the (point, namespace)
                        # broadcast store and record the change.
                        if namespace not in self.broadcasts[point_string]:
                            self.broadcasts[point_string][namespace] = {}
                        self._addict(
                            self.broadcasts[point_string][namespace],
                            setting)
                        modified_settings.append(
                            (point_string, namespace, setting))
    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings)
    LOG.info(get_broadcast_change_report(modified_settings))
    bad_options = {}
    if bad_point_strings:
        bad_options["point_strings"] = bad_point_strings
    if bad_namespaces:
        bad_options["namespaces"] = bad_namespaces
    return modified_settings, bad_options
def check_task_jobs(self, suite, task_pool):
    """Check submission and execution timeout and polling timers.

    Poll tasks that have timed out and/or have reached next polling
    time.
    """
    now = time()
    poll_tasks = set()
    for itask in task_pool.get_tasks():
        if not self.task_events_mgr.check_job_time(itask, now):
            continue
        poll_tasks.add(itask)
        if itask.poll_timer.delay is not None:
            LOG.info(
                '[%s] -poll now, (next in %s)',
                itask,
                itask.poll_timer.delay_timeout_as_str())
    if poll_tasks:
        self.poll_task_jobs(suite, poll_tasks)
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Tasks are grouped by their (host, owner) destination; one command
    per destination is queued to the multiprocess pool.
    """
    if not itasks:
        return
    # Group the tasks by user@host.
    grouped = {}
    for task in itasks:
        grouped.setdefault(
            (task.task_host, task.task_owner), []).append(task)
    for (host, owner), tasks in sorted(grouped.items()):
        cmd = ["cylc", cmd_key]
        if LOG.isEnabledFor(DEBUG):
            cmd.append("--debug")
        if is_remote_host(host):
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
        # One job log dir argument per task, in identity order.
        cmd += [
            get_task_job_id(task.point, task.tdef.name, task.submit_num)
            for task in sorted(tasks, key=lambda t: t.identity)]
        self.proc_pool.put_command(
            SubProcContext(cmd_key, cmd), callback, [suite, tasks])
def get_new_flow(self, description: Optional[str] = None) -> int:
    """Increment flow counter, record flow metadata.

    Returns the new flow number.
    """
    self.counter += 1
    # Record the start time truncated to whole seconds.
    stamp = datetime.datetime.now()
    now_sec: str = str(
        stamp - datetime.timedelta(microseconds=stamp.microsecond))
    meta = {
        "description": description or "no description",
        "start_time": now_sec,
    }
    self.flows[self.counter] = meta
    LOG.info(
        f"New flow: {self.counter} "
        f"({meta['description']}) "
        f"{now_sec}")
    self.db_mgr.put_insert_workflow_flows(self.counter, meta)
    return self.counter
def _process_job_logs_retrieval(self, schd_ctx, ctx, id_keys):
    """Process retrieval of task job logs from remote user@host.

    Builds an rsync command (from site/user config) that pulls only the
    job log directories named by "id_keys" from the remote suite run
    directory into the local one, then queues it to the process pool.
    """
    # Split "user@host"; a bare host means the default (local) user.
    if ctx.user_at_host and "@" in ctx.user_at_host:
        s_user, s_host = ctx.user_at_host.split("@", 1)
    else:
        s_user, s_host = (None, ctx.user_at_host)
    ssh_str = str(glbl_cfg().get_host_item("ssh command", s_host, s_user))
    rsync_str = str(glbl_cfg().get_host_item(
        "retrieve job logs command", s_host, s_user))
    # rsync transport comes from the configured ssh command.
    cmd = shlex.split(rsync_str) + ["--rsh=" + ssh_str]
    if LOG.isEnabledFor(DEBUG):
        cmd.append("-v")
    if ctx.max_size:
        cmd.append("--max-size=%s" % (ctx.max_size,))
    # Includes and excludes
    includes = set()
    for _, point, name, submit_num in id_keys:
        # Include relevant directories, all levels needed
        # (rsync include rules require every parent level listed).
        includes.add("/%s" % (point))
        includes.add("/%s/%s" % (point, name))
        includes.add("/%s/%s/%02d" % (point, name, submit_num))
        includes.add("/%s/%s/%02d/**" % (point, name, submit_num))
    cmd += ["--include=%s" % (include) for include in sorted(includes)]
    cmd.append("--exclude=/**")  # exclude everything else
    # Remote source
    cmd.append("%s:%s/" % (
        ctx.user_at_host,
        get_remote_suite_run_job_dir(s_host, s_user, schd_ctx.suite)))
    # Local target
    cmd.append(get_suite_run_job_dir(schd_ctx.suite) + "/")
    self.proc_pool.put_command(
        SubProcContext(ctx, cmd, env=dict(os.environ), id_keys=id_keys),
        self._job_logs_retrieval_callback, [schd_ctx])
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host. Put a job command for each
    user@host to the multiprocess pool.
    """
    if not itasks:
        return
    # Bucket the tasks by (host, owner).
    by_auth = {}
    for task in itasks:
        auth = (task.task_host, task.task_owner)
        by_auth.setdefault(auth, []).append(task)
    for (host, owner), tasks in sorted(by_auth.items()):
        cmd = ["cylc", cmd_key]
        if LOG.isEnabledFor(DEBUG):
            cmd.append("--debug")
        if is_remote_host(host):
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, "suite job log directory", host, owner))
        job_log_dirs = [
            get_task_job_id(task.point, task.tdef.name, task.submit_num)
            for task in sorted(tasks, key=lambda t: t.identity)]
        self.proc_pool.put_command(
            SubProcContext(cmd_key, cmd + job_log_dirs),
            callback, [suite, tasks])
def _get_job_scripts(itask, rtconfig):
    """Return pre-script, script, post-script for a job.

    When the task defines suite-state polling, the main script is
    replaced by a generated "cylc suite-state" command (echoed, then
    executed).
    """
    pre_script = rtconfig['pre-script']
    script = rtconfig['script']
    post_script = rtconfig['post-script']
    polling_cfg = itask.tdef.suite_polling_cfg
    if polling_cfg:
        # Automatic suite state polling script
        comstr = (
            "cylc suite-state "
            + " --task=" + polling_cfg['task']
            + " --point=" + str(itask.point))
        if LOG.isEnabledFor(DEBUG):
            comstr += ' --debug'
        for key, fmt in [('user', ' --%s=%s'),
                         ('host', ' --%s=%s'),
                         ('interval', ' --%s=%d'),
                         ('max-polls', ' --%s=%s'),
                         ('run-dir', ' --%s=%s')]:
            if rtconfig['suite state polling'][key]:
                comstr += fmt % (key, rtconfig['suite state polling'][key])
        if rtconfig['suite state polling']['message']:
            comstr += " --message='%s'" % (
                rtconfig['suite state polling']['message'])
        else:
            comstr += " --status=" + polling_cfg['status']
        comstr += " " + polling_cfg['suite']
        script = "echo " + comstr + "\n" + comstr
    return pre_script, script, post_script
def _get_job_scripts(itask, rtconfig):
    """Return pre-script, script, post-script for a job.

    If automatic suite-state polling is configured for the task, a
    "cylc suite-state" command is generated and substituted for the
    main script.
    """
    pre_script = rtconfig['pre-script']
    post_script = rtconfig['post-script']
    script = rtconfig['script']
    if itask.tdef.suite_polling_cfg:
        # Automatic suite state polling script
        polling = rtconfig['suite state polling']
        parts = [
            "cylc suite-state ",
            " --task=" + itask.tdef.suite_polling_cfg['task'],
            " --point=" + str(itask.point),
        ]
        comstr = "".join(parts)
        if LOG.isEnabledFor(DEBUG):
            comstr += ' --debug'
        option_formats = [
            ('user', ' --%s=%s'),
            ('host', ' --%s=%s'),
            ('interval', ' --%s=%d'),
            ('max-polls', ' --%s=%s'),
            ('run-dir', ' --%s=%s'),
        ]
        for key, fmt in option_formats:
            if polling[key]:
                comstr += fmt % (key, polling[key])
        if polling['message']:
            comstr += " --message='%s'" % (polling['message'])
        else:
            comstr += " --status=" + itask.tdef.suite_polling_cfg['status']
        comstr += " " + itask.tdef.suite_polling_cfg['suite']
        script = "echo " + comstr + "\n" + comstr
    return pre_script, script, post_script
def kill_task_jobs(self, suite, itasks):
    """Kill jobs of active tasks, and hold the tasks.

    If items is specified, kill active tasks matching given IDs.
    """
    killable = []
    for task in itasks:
        if task.state.status not in TASK_STATUSES_ACTIVE:
            # Only active (submitted/running) tasks can be killed.
            LOG.warning('skipping %s: task not killable' % task.identity)
            continue
        task.state.set_held()
        killable.append(task)
    self._run_job_cmd(
        self.JOBS_KILL, suite, killable, self._kill_task_jobs_callback)
def _open_logs(reg, no_detach):
    """Open Cylc log handlers for a flow run."""
    if not no_detach:
        # Detaching: close and drop any handlers inherited from the
        # parent process before attaching the file handlers.
        while LOG.handlers:
            stale = LOG.handlers[0]
            stale.close()
            LOG.removeHandler(stale)
    log_path = get_suite_run_log_name(reg)
    LOG.addHandler(TimestampRotatingFileHandler(log_path, no_detach))
    # Add file installation log
    install_log_path = get_suite_file_install_log_name(reg)
    RSYNC_LOG.addHandler(
        TimestampRotatingFileHandler(install_log_path, no_detach))
def _manip_task_jobs_callback(self, ctx, suite, itasks, summary_callback,
                              more_callbacks=None):
    """Callback when submit/poll/kill tasks command exits.

    Args:
        ctx: SubProcContext of the finished command.
        suite: suite name.
        itasks: the TaskProxy objects the command was run for.
        summary_callback: invoked per summary output line.
        more_callbacks: optional {output-prefix: callback} mapping for
            other recognised output lines.
    """
    if ctx.ret_code:
        LOG.error(ctx)
    else:
        LOG.debug(ctx)
    # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
    #
    # Note for "reload": A TaskProxy instance may be replaced on reload, so
    # the "itasks" list may not reference the TaskProxy objects that
    # replace the old ones. The .reload_successor attribute provides the
    # link(s) for us to get to the latest replacement.
    #
    # Note for "kill": It is possible for a job to trigger its trap and
    # report back to the suite before this logic is called. If so, the
    # task will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING,
    # and its output line will be ignored here.
    tasks = {}
    for itask in itasks:
        while itask.reload_successor is not None:
            itask = itask.reload_successor
        if itask.point is not None and itask.submit_num:
            submit_num = "%02d" % (itask.submit_num)
            tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
    handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
    if more_callbacks:
        for prefix, callback in more_callbacks.items():
            handlers.append((prefix, callback))
    out = ctx.out
    if not out:
        out = ""
    bad_tasks = dict(tasks)
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                try:
                    path = line.split("|", 2)[1]  # timestamp, path, status
                    point, name, submit_num = path.split(os.sep, 2)
                    if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                        del bad_tasks[(point, name, submit_num)]
                    itask = tasks[(point, name, submit_num)]
                    callback(suite, itask, ctx, line)
                except (LookupError, ValueError) as exc:
                    # KeyError is a subclass of LookupError, so listing
                    # it separately (as before) was redundant.
                    LOG.warning(
                        'Unhandled %s output: %s', ctx.cmd_key, line)
                    LOG.exception(exc)
    # Task jobs that are in the original command but did not get a status
    # in the output. Handle as failures.
    for key, itask in sorted(bad_tasks.items()):
        line = ("|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
        summary_callback(suite, itask, ctx, line)
def satisfy_xtriggers(self, itask: TaskProxy):
    """Attempt to satisfy itask's xtriggers.

    Args:
        itask (TaskProxy): TaskProxy

    Wall-clock triggers are checked synchronously here; all other
    xtrigger functions are dispatched to the process pool and their
    results picked up later via self.callback.
    """
    for label, sig, ctx, _ in self._get_xtrigs(itask, unsat_only=True):
        if sig.startswith("wall_clock"):
            # Special case: synchronous clock check.
            ctx.func_kwargs.update({
                'point_as_seconds': itask.get_point_as_seconds(),
            })
            if wall_clock(*ctx.func_args, **ctx.func_kwargs):
                itask.state.xtriggers[label] = True
                self.sat_xtrig[sig] = {}
                LOG.info('xtrigger satisfied: %s = %s', label, sig)
            # Clock triggers never go through the async path below.
            continue
        # General case: asynchronous xtrigger function call.
        if sig in self.sat_xtrig:
            # Already satisfied (possibly by another task's check).
            if not itask.state.xtriggers[label]:
                itask.state.xtriggers[label] = True
                res = {}
                for key, val in self.sat_xtrig[sig].items():
                    res["%s_%s" % (label, key)] = val
                if res:
                    # Broadcast the xtrigger function's returned values
                    # into the task's environment.
                    xtrigger_env = [{
                        'environment': {key: val}} for key, val in
                        res.items()]
                    self.broadcast_mgr.put_broadcast(
                        [str(ctx.point)],
                        [itask.tdef.name],
                        xtrigger_env)
            continue
        if sig in self.active:
            # Already waiting on this result.
            continue
        now = time()
        if sig in self.t_next_call and now < self.t_next_call[sig]:
            # Too soon to call this one again.
            continue
        self.t_next_call[sig] = now + ctx.intvl
        # Queue to the process pool, and record as active.
        self.active.append(sig)
        self.proc_pool.put_command(ctx, self.callback)
def apply_delta(key, delta, data): """Apply delta to specific data-store workflow and type.""" # Assimilate new data if getattr(delta, 'added', False): if key != WORKFLOW: data[key].update({e.id: e for e in delta.added}) elif delta.added.ListFields(): data[key].CopyFrom(delta.added) # Merge in updated fields if getattr(delta, 'updated', False): if key == WORKFLOW: # Clear fields that require overwrite with delta field_set = {f.name for f, _ in delta.updated.ListFields()} for field in CLEAR_FIELD_MAP[key]: if field in field_set: data[key].ClearField(field) data[key].MergeFrom(delta.updated) else: for element in delta.updated: try: # Clear fields that require overwrite with delta if CLEAR_FIELD_MAP[key]: for field, _ in element.ListFields(): if field.name in CLEAR_FIELD_MAP[key]: data[key][element.id].ClearField(field.name) data[key][element.id].MergeFrom(element) except KeyError as exc: # Ensure data-sync doesn't fail with # network issues, sync reconcile/validate will catch. LOG.debug('Missing Data-Store element ' 'on update application: %s' % str(exc)) continue # Prune data elements if hasattr(delta, 'pruned'): # Prune data elements by id for del_id in delta.pruned: if del_id not in data[key]: continue if key == TASK_PROXIES: data[TASKS][data[key][del_id].task].proxies.remove(del_id) getattr(data[WORKFLOW], key).remove(del_id) elif key == FAMILY_PROXIES: data[FAMILIES][data[key][del_id].family].proxies.remove(del_id) getattr(data[WORKFLOW], key).remove(del_id) elif key == EDGES: getattr(data[WORKFLOW], key).edges.remove(del_id) del data[key][del_id]
def _run_event_custom_handlers(self, schd, template_variables, event):
    """Helper for "run_event_handlers", custom event handlers.

    Args:
        schd: the scheduler (provides the workflow config).
        template_variables: mapping used to fill handler templates.
        event: name of the workflow event being handled.
    """
    # Look for event handlers
    # 1. Handlers for specific event
    # 2. General handlers
    config = schd.config
    handlers = self.get_events_conf(config, '%s handlers' % event)
    if not handlers and (
        event in self.get_events_conf(config, 'handler events', [])
    ):
        handlers = self.get_events_conf(config, 'handlers')
    if not handlers:
        return
    for i, handler in enumerate(handlers):
        cmd_key = ('%s-%02d' % (self.WORKFLOW_EVENT_HANDLER, i), event)
        try:
            cmd = handler % (template_variables)
        except (KeyError, ValueError) as exc:
            # KeyError: the template references an unknown variable.
            # ValueError: the template itself is malformed (e.g. a stray
            # '%'); previously uncaught, which let one bad user template
            # crash event handling instead of being logged and skipped.
            message = f'{cmd_key} bad template: {handler}\n{exc}'
            LOG.error(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = (
                f"%(handler)s"
                f" '%({EventData.Event.value})s'"
                f" '%({EventData.Workflow.value})s'"
                f" '%({EventData.Message.value})s'"
            ) % (
                {'handler': handler, **template_variables}
            )
        proc_ctx = SubProcContext(
            cmd_key,
            cmd,
            env=dict(os.environ),
            shell=True  # nosec (designed to run user defined code)
        )
        if self.proc_pool.closed:
            # Run command in foreground if abort on failure is set or if
            # process pool is closed
            self.proc_pool.run_command(proc_ctx)
            self._run_event_handlers_callback(proc_ctx)
        else:
            # Run command using process pool otherwise
            self.proc_pool.put_command(
                proc_ctx, callback=self._run_event_handlers_callback)
def _get_host_metrics(self):
    """Run "cylc get-host-metrics" commands on hosts.

    Return (dict): {host: host-metrics-dict, ...}
    """
    host_stats = {}
    # Run "cylc get-host-metrics" commands on hosts
    host_proc_map = {}
    cmd = [self.CMD_BASE] + sorted(self._get_host_metrics_opts())
    # Start up commands on hosts
    for host in self.hosts:
        if is_remote_host(host):
            host_proc_map[host] = remote_cylc_cmd(
                cmd, stdin=None, host=host, capture_process=True)
        elif 'localhost' in host_proc_map:
            continue  # Don't duplicate localhost
        else:  # 1st instance of localhost
            host_proc_map['localhost'] = run_cmd(
                ['cylc'] + cmd, capture_process=True)
    # Collect results from commands
    while host_proc_map:
        # Snapshot the items so finished entries can be deleted while
        # iterating (one copy suffices; .copy() before list() was
        # redundant).
        for host, proc in list(host_proc_map.items()):
            if proc.poll() is None:
                continue
            del host_proc_map[host]
            out, err = (f.decode() for f in proc.communicate())
            if proc.wait():
                # Command failed in verbose/debug mode.
                # NOTE: the two format-string halves were previously
                # joined with no separator, garbling host and command
                # together in the message.
                LOG.warning(
                    "can't get host metric from '%s' %s"
                    "  # returncode=%d, err=%s\n",
                    host,
                    ' '.join(quote(item) for item in cmd),
                    proc.returncode, err)
            else:
                # Command OK
                # Users may have profile scripts that write to STDOUT.
                # Drop all output lines until the first character of a
                # line is '{'. Hopefully this is enough to find us the
                # first line that denotes the beginning of the expected
                # JSON data structure.
                out = ''.join(dropwhile(
                    lambda s: not s.startswith('{'), out.splitlines(True)))
                host_stats[host] = json.loads(out)
        sleep(0.01)
    return host_stats
def _can_auto_restart():
    """Determine whether this workflow can safely auto stop-restart."""
    # Check whether there is currently an available host to restart on.
    try:
        select_workflow_host(cached=False)
    except HostSelectException:
        LOG.critical(
            'Workflow cannot automatically restart because:\n'
            + 'No alternative host to restart workflow on.')
        return False
    except Exception:
        # Any unexpected error in host selection shouldn't be able to
        # take down the workflow.
        LOG.critical(
            'Workflow cannot automatically restart because:\n'
            + 'Error in host selection:\n'
            + traceback.format_exc())
        return False
    return True
def _listener(self):
    """The server main loop, listen for and serve requests.

    Runs until a 'STOP' command arrives on the queue. Every received
    message gets exactly one reply on the REP socket, including
    decode/authentication failures.
    """
    while True:
        # process any commands passed to the listener by its parent
        # process
        if self.queue.qsize():
            command = self.queue.get()
            if command == 'STOP':
                break
            raise ValueError('Unknown command "%s"' % command)

        try:
            # wait RECV_TIMEOUT for a message
            msg = self.socket.recv_string()
        except zmq.error.Again:
            # timeout, continue with the loop, this allows the listener
            # thread to stop
            continue

        # attempt to decode the message, authenticating the user in the
        # process
        try:
            message = self.decode(msg, self.secret())
        except Exception as exc:  # purposefully catch generic exception
            # Failed to decode message, possibly resulting from failed
            # authentication.
            # BUG FIX: previously this `return`ed the error dict, which
            # killed the listener thread and never sent a reply, leaving
            # the REP socket in an unusable state. Send the error back
            # to the client and keep listening instead.
            import traceback
            LOG.exception(exc)
            res = {
                'error': {
                    'message': str(exc),
                    'traceback': traceback.format_exc(),
                }
            }
            response = self.encode(res, self.secret()).encode()
        else:
            # success case - serve the request
            LOG.debug('zmq:recv %s', message)
            res = self._receiver(message)
            if message['command'] in PB_METHOD_MAP:
                response = res['data']
            else:
                response = self.encode(res, self.secret()).encode()
        LOG.debug('zmq:send %s', res)
        # send back the string to bytes response
        self.socket.send(response)
        sleep(0)  # yield control to other threads
def set_job_time(self, job_d, event_key, time_str=None):
    """Set an event time in job pool object.

    Set values of both event_key + '_time' and event_key +
    '_time_string'.

    Args:
        job_d: job identifier string parsed by self.parse_job_item.
        event_key: event name, used to derive the delta attribute name.
        time_str: the time value to set (may be None).
    """
    update_time = time()
    point, name, sub_num = self.parse_job_item(job_d)
    j_id = (f'{self.workflow_id}{ID_DELIM}{point}'
            f'{ID_DELIM}{name}{ID_DELIM}{sub_num}')
    # Compute the attribute name before the try block: previously it
    # was bound inside, so an exception raised by PbJob(...) made the
    # error log below fail with NameError on `time_attr`.
    time_attr = f'{event_key}_time'
    try:
        j_delta = PbJob(stamp=f'{j_id}@{update_time}')
        setattr(j_delta, time_attr, time_str)
        self.updates.setdefault(j_id, PbJob(id=j_id)).MergeFrom(j_delta)
        self.updates_pending = True
    except (TypeError, AttributeError) as exc:
        # e.g. time_str of the wrong type for the protobuf field.
        LOG.error(f'Unable to set {j_id} {time_attr} field: {str(exc)}')
def clean(reg):
    """Remove a stopped workflow from the local filesystem only.

    Deletes the workflow run directory and any symlink dirs. Note: if
    the run dir has already been manually deleted, it will not be
    possible to clean the symlink dirs.

    Args:
        reg (str): Workflow name.

    Raises:
        WorkflowFilesError: if a symlink dir points somewhere other
            than the expected cylc-run location (abort rather than
            delete an arbitrary target).
    """
    run_dir = Path(get_workflow_run_dir(reg))
    try:
        _clean_check(reg, run_dir)
    except FileNotFoundError as exc:
        # Nothing to clean; not an error.
        LOG.info(str(exc))
        return
    # Note: 'share/cycle' must come first, and '' must come last
    # ('' is the run dir itself; children must be handled before it).
    for possible_symlink in (
            SuiteFiles.SHARE_CYCLE_DIR, SuiteFiles.SHARE_DIR,
            SuiteFiles.LOG_DIR, SuiteFiles.WORK_DIR, ''):
        name = Path(possible_symlink)
        path = Path(run_dir, possible_symlink)
        if path.is_symlink():
            # Ensure symlink is pointing to expected directory. If not,
            # something is wrong and we should abort
            target = path.resolve()
            if target.exists() and not target.is_dir():
                raise WorkflowFilesError(
                    f'Invalid Cylc symlink directory {path} -> {target}\n'
                    f'Target is not a directory')
            expected_end = str(Path('cylc-run', reg, name))
            if not str(target).endswith(expected_end):
                raise WorkflowFilesError(
                    f'Invalid Cylc symlink directory {path} -> {target}\n'
                    f'Expected target to end with "{expected_end}"')
            # Remove <symlink_dir>/cylc-run/<reg>
            target_cylc_run_dir = str(target).rsplit(str(reg), 1)[0]
            target_reg_dir = Path(target_cylc_run_dir, reg)
            if target_reg_dir.is_dir():
                remove_dir(target_reg_dir)
            # Remove empty parents
            _remove_empty_reg_parents(reg, target_reg_dir)
    remove_dir(run_dir)
    _remove_empty_reg_parents(reg, run_dir)
async def async_request(self, command, args=None, timeout=None):
    """Send an asynchronous request using asyncio.

    Has the same arguments and return values as ``serial_request``.
    """
    # Caller supplies seconds; zmq poll() wants milliseconds.
    if timeout:
        timeout = float(timeout)
    timeout = (timeout * 1000 if timeout else None) or self.timeout
    if not args:
        args = {}

    # get secret for this request
    # assumes secret won't change during the request
    try:
        secret = self.secret()
    except cylc.flow.suite_srv_files_mgr.SuiteServiceFileError:
        raise ClientError('could not read suite passphrase')

    # send message
    msg = {'command': command, 'args': args}
    msg.update(self.header)
    LOG.debug('zmq:send %s' % msg)
    message = encrypt(msg, secret)
    self.socket.send_string(message)

    # receive response (poll first so we can time out cleanly)
    if self.poller.poll(timeout):
        res = await self.socket.recv_string()
    else:
        if self.timeout_handler:
            self.timeout_handler()
        raise ClientTimeout('Timeout waiting for server response.')

    try:
        response = decrypt(res, secret)
        LOG.debug('zmq:recv %s' % response)
    except jose.exceptions.JWTError:
        # Decryption failure usually means the server-side passphrase
        # no longer matches ours.
        raise ClientError(
            'Could not decrypt response. Has the passphrase changed?')

    # A response carries either 'data' (success) or 'error'.
    try:
        return response['data']
    except KeyError:
        error = response['error']
        raise ClientError(error['message'], error.get('traceback'))
def _run_command_init(cls, ctx, callback=None, callback_args=None):
    """Prepare and launch shell command in ctx.

    Returns the launched process object, or None (after invoking the
    exit callback) if the command could not be started.
    """
    try:
        if ctx.cmd_kwargs.get('stdin_files'):
            if len(ctx.cmd_kwargs['stdin_files']) > 1:
                # Concatenate multiple stdin sources into one temp file.
                stdin_file = cls.get_temporary_file()
                for file_ in ctx.cmd_kwargs['stdin_files']:
                    if hasattr(file_, 'read'):
                        stdin_file.write(file_.read())
                    else:
                        # Close the source promptly instead of leaking
                        # the handle until garbage collection.
                        with open(file_, 'rb') as file_handle:
                            stdin_file.write(file_handle.read())
                stdin_file.seek(0)
            elif hasattr(ctx.cmd_kwargs['stdin_files'][0], 'read'):
                stdin_file = ctx.cmd_kwargs['stdin_files'][0]
            else:
                # Handed straight to the child process as its stdin.
                stdin_file = open(ctx.cmd_kwargs['stdin_files'][0], 'rb')
        elif ctx.cmd_kwargs.get('stdin_str'):
            stdin_file = cls.get_temporary_file()
            stdin_file.write(ctx.cmd_kwargs.get('stdin_str').encode())
            stdin_file.seek(0)
        else:
            stdin_file = DEVNULL
        proc = procopen(
            ctx.cmd, stdin=stdin_file, stdoutpipe=True, stderrpipe=True,
            # Execute command as a process group leader,
            # so we can use "os.killpg" to kill the whole group.
            preexec_fn=os.setpgrp,
            env=ctx.cmd_kwargs.get('env'),
            usesh=ctx.cmd_kwargs.get('shell'))
        # calls to open a shell are aggregated in cylc_subproc.procopen()
        # with logging for what is calling it and the commands given
    except (IOError, OSError) as exc:
        if exc.filename is None:
            exc.filename = ctx.cmd[0]
        LOG.exception(exc)
        ctx.ret_code = 1
        ctx.err = str(exc)
        cls._run_command_exit(ctx, callback, callback_args)
        return None
    else:
        LOG.debug(ctx.cmd)
        return proc
def upgrade(self):
    """Apply all registered deprecated-item upgrades to the config.

    Collects a message per applied upgrade, then logs them all at
    DEBUG (site config) or WARNING (user config) level.
    """
    warnings = OrderedDict()
    for vn, upgs in self.upgrades.items():
        for u in upgs:
            try:
                exp = self.expand(u)
            except (KeyError, UpgradeError):
                # Upgrade does not apply to this config; skip it.
                continue
            for upg in exp:
                try:
                    old = self.get_item(upg['old'])
                except KeyError:
                    # OK: deprecated item not found
                    pass
                else:
                    msg = self.show_keys(upg['old'])
                    if upg['new']:
                        msg += ' -> ' + self.show_keys(upg['new'])
                    else:
                        # No new location: item keeps its old keys.
                        upg['new'] = upg['old']
                    msg += " - " + upg['cvt'].describe()
                    if not upg['silent']:
                        warnings.setdefault(vn, [])
                        warnings[vn].append(msg)
                    # Delete first, then re-insert the converted value
                    # (unless the item is simply obsolete).
                    self.del_item(upg['old'])
                    if upg['cvt'].describe() != "DELETED (OBSOLETE)":
                        self.put_item(upg['new'], upg['cvt'].convert(old))
    if warnings:
        level = WARNING
        if self.descr == self.SITE_CONFIG:
            # Site level configuration, user cannot easily fix.
            # Only log at debug level.
            level = DEBUG
        else:
            # User level configuration, user should be able to fix.
            # Log at warning level.
            level = WARNING
        LOG.log(
            level,
            "deprecated items were automatically upgraded in '%s':",
            self.descr)
        for vn, msgs in warnings.items():
            for msg in msgs:
                LOG.log(level, ' * (%s) %s', vn, msg)
def _manip_task_jobs_callback(
        self, ctx, suite, itasks, summary_callback, more_callbacks=None):
    """Callback when submit/poll/kill tasks command exits.

    Args:
        ctx: SubProcContext of the finished command.
        suite: suite name.
        itasks: the TaskProxy objects the command was run for.
        summary_callback: invoked per summary output line.
        more_callbacks: optional {output-prefix: callback} mapping for
            other recognised output lines.
    """
    if ctx.ret_code:
        LOG.error(ctx)
    else:
        LOG.debug(ctx)
    # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
    #
    # Note for "reload": A TaskProxy instance may be replaced on reload, so
    # the "itasks" list may not reference the TaskProxy objects that
    # replace the old ones. The .reload_successor attribute provides the
    # link(s) for us to get to the latest replacement.
    #
    # Note for "kill": It is possible for a job to trigger its trap and
    # report back to the suite before this logic is called. If so, the
    # task will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING,
    # and its output line will be ignored here.
    tasks = {}
    for itask in itasks:
        while itask.reload_successor is not None:
            itask = itask.reload_successor
        if itask.point is not None and itask.submit_num:
            submit_num = "%02d" % (itask.submit_num)
            tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
    handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
    if more_callbacks:
        for prefix, callback in more_callbacks.items():
            handlers.append((prefix, callback))
    out = ctx.out
    if not out:
        out = ""
    bad_tasks = dict(tasks)
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                try:
                    path = line.split("|", 2)[1]  # timestamp, path, status
                    point, name, submit_num = path.split(os.sep, 2)
                    if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                        del bad_tasks[(point, name, submit_num)]
                    itask = tasks[(point, name, submit_num)]
                    callback(suite, itask, ctx, line)
                except (LookupError, ValueError) as exc:
                    # Note: KeyError is a subclass of LookupError, so it
                    # did not need listing separately (as it was before).
                    LOG.warning(
                        'Unhandled %s output: %s', ctx.cmd_key, line)
                    LOG.exception(exc)
    # Task jobs that are in the original command but did not get a status
    # in the output. Handle as failures.
    for key, itask in sorted(bad_tasks.items()):
        line = (
            "|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
        summary_callback(suite, itask, ctx, line)
def _run_command_init(cls, ctx, callback=None, callback_args=None):
    """Prepare and launch shell command in ctx.

    Returns the launched process object, or None (after invoking the
    exit callback) if the command could not be started.
    """
    try:
        if ctx.cmd_kwargs.get('stdin_files'):
            if len(ctx.cmd_kwargs['stdin_files']) > 1:
                # Concatenate multiple stdin sources into one temp file.
                stdin_file = cls.get_temporary_file()
                for file_ in ctx.cmd_kwargs['stdin_files']:
                    if hasattr(file_, 'read'):
                        stdin_file.write(file_.read())
                    else:
                        # Close the source promptly instead of leaking
                        # the handle until garbage collection.
                        with open(file_, 'rb') as file_handle:
                            stdin_file.write(file_handle.read())
                stdin_file.seek(0)
            elif hasattr(ctx.cmd_kwargs['stdin_files'][0], 'read'):
                stdin_file = ctx.cmd_kwargs['stdin_files'][0]
            else:
                # Handed straight to the child process as its stdin.
                stdin_file = open(
                    ctx.cmd_kwargs['stdin_files'][0], 'rb')
        elif ctx.cmd_kwargs.get('stdin_str'):
            stdin_file = cls.get_temporary_file()
            stdin_file.write(ctx.cmd_kwargs.get('stdin_str').encode())
            stdin_file.seek(0)
        else:
            stdin_file = open(os.devnull)
        proc = procopen(
            ctx.cmd, stdin=stdin_file, stdoutpipe=True, stderrpipe=True,
            # Execute command as a process group leader,
            # so we can use "os.killpg" to kill the whole group.
            preexec_fn=os.setpgrp,
            env=ctx.cmd_kwargs.get('env'),
            usesh=ctx.cmd_kwargs.get('shell'))
        # calls to open a shell are aggregated in cylc_subproc.procopen()
        # with logging for what is calling it and the commands given
    except (IOError, OSError) as exc:
        if exc.filename is None:
            exc.filename = ctx.cmd[0]
        LOG.exception(exc)
        ctx.ret_code = 1
        ctx.err = str(exc)
        cls._run_command_exit(ctx, callback, callback_args)
        return None
    else:
        LOG.debug(ctx.cmd)
        return proc
def _dump_item(path, item, value):
    """Dump "value" to a file called "item" in the directory "path".

    1. File permission should already be user-read-write-only on
       creation by mkstemp.
    2. The combination of os.fsync and os.rename should guarantee
       that we don't end up with an incomplete file.
    """
    from tempfile import NamedTemporaryFile
    os.makedirs(path, exist_ok=True)
    tmp = NamedTemporaryFile(prefix=item, dir=path, delete=False)
    try:
        tmp.write(value.encode())
    except AttributeError:
        # Value is already bytes.
        tmp.write(value)
    os.fsync(tmp.fileno())
    tmp.close()
    target = os.path.join(path, item)
    # Atomic replace of any existing file.
    os.rename(tmp.name, target)
    LOG.debug('Generated %s', target)
def test_ioerror_is_ignored(self, mocked_suite_srv_files_mgr,
                            mocked_suite_db_mgr, mocked_broadcast_mgr):
    """Test that IOError's are ignored when closing Scheduler logs.

    When a disk errors occurs, the scheduler.close_logs method may
    result in an IOError. This, combined with other variables, may
    cause an infinite loop. So it is better that it is ignored.
    """
    mocked_suite_srv_files_mgr.return_value \
        .get_suite_source_dir.return_value = "."
    scheduler = Scheduler(
        is_restart=False, options=Options(), args=["suiteA"])
    # Handler whose close() always raises IOError.
    failing_handler = mock.MagicMock()
    failing_handler.close.side_effect = IOError
    failing_handler.level = logging.INFO
    LOG.addHandler(failing_handler)
    scheduler.close_logs()
    self.assertEqual(1, failing_handler.close.call_count)
    LOG.removeHandler(failing_handler)
def _rank_good_hosts(self, all_host_stats):
    """Rank, by specified method, 'good' hosts to return the most
    suitable.

    Take a dictionary of hosts considered 'good' with the corresponding
    metric data, and rank them via the method specified in the global
    configuration, returning the lowest-ranked (taken as best) host.
    """
    # Reduce each host's full metrics structure to the single value
    # used by the configured rank method.
    rank_vals = {
        host: metric[self.rank_method]
        for host, metric in all_host_stats.items()}
    LOG.debug(
        "INFO: host %s values extracted are: %s",
        self.rank_method,
        "\n".join(
            " %s: %s" % item for item in rank_vals.items()))
    # Hosts sorted by ascending metric value.
    ranked = sorted(rank_vals, key=rank_vals.get)
    base_msg = ("good (metric-returning) hosts were ranked in the "
                "following order, from most to least suitable: %s")
    if self.rank_method in ("memory", "disk-space:" + self.USE_DISK_PATH):
        # 'Most free' wins, i.e. the highest value: take the tail of
        # the ascending list.
        LOG.debug(base_msg, ', '.join(ranked[::-1]))
        return ranked[-1]
    # Only a load average remains ('random' is dealt with earlier):
    # lowest value wins, i.e. the head of the ascending list.
    LOG.debug(base_msg, ', '.join(ranked))
    return ranked[0]