def __init__(
        self,
        host_id,
        talker,
        raise_on_failure=True,
        retcode=(0, ),
        args=None,
        timeout=HOUR,
        server_timeout=True,
        line_timeout=None,
        log_file=None,
        name=None,
        max_output_per_channel=None,
        set_new_logpath=None,
):
    self.name = name
    self.job_id = str(uuid4())
    self.talker = talker
    self.retcode = None
    self.aborted_by = None
    self.ack = None
    self.stdout = []
    self.stderr = []
    self.raise_on_failure = raise_on_failure
    self.good_codes = retcode
    self.args = args
    self.host_id = host_id
    self.hostname = host_id.partition('.')[-1]
    self.ack_timer = None
    self.handling_timer = None
    self.timeout = timeout or HOUR  # no command should run over an hour, unless specified explicitly
    self.server_timeout = server_timeout
    self.line_timeout = Duration(line_timeout) if line_timeout else None
    self.log_file = log_file
    self.max_output_per_channel = max_output_per_channel or MAX_OUTPUT_PER_CHANNEL
    self.set_new_logpath = set_new_logpath
    self.client_timer = Timer(expiration=self.timeout if not server_timeout else self.timeout + 5)
    self._line_timeout_synced = False
    self.attempts = 0  # in case of TalkerCommandLost

    # this is the version from which the agent acknowledges receipt of a command
    self.ack_supported = self.talker.agent_version >= V1_3_1
    # this is the version from which the agent ignores params it does not support
    self.kwargs_resilient = self.talker.agent_version >= V1_3_1
    # this is the version from which the agent reports the command's PID
    self.pid_supported = self.talker.agent_version >= V1_6_0
def _logged_func(self):
    stack = ExitStack()
    self.exc = None
    self.timer = Timer()
    stack.callback(self.timer.stop)
    stack.callback(self.stop)
    try:
        if not self.console_logging:
            stack.enter_context(_logger.suppressed())
        _logger.debug("%s - starting", self)
        while True:
            self._result = self.func(*self.args, **self.kwargs)
            if not self.loop:
                return
            if self.wait(self.sleep):
                _logger.debug("%s - stopped", self)
                return
    except ProcessExiting as exc:
        _logger.debug(exc)
        raise
    except KeyboardInterrupt as exc:
        _logger.silent_exception("KeyboardInterrupt in thread running %s:", self.func)
        self.exc = apply_timestamp(exc)
        if IS_GEVENT:
            raise  # in gevent we should let this exception propagate to the main greenlet
    except Exception as exc:
        _logger.silent_exception(
            "Exception in thread running %s: %s (traceback can be found in debug-level logs)",
            self.func, type(exc))
        self.exc = apply_timestamp(exc)
    finally:
        stack.close()
def acquire(self, blocking=True, timeout=-1, lease_expiration=None):
    if not blocking:
        ret = self._lock.acquire(blocking=False)
        lease_timer = self._lease_timer  # touch it once, so we don't hit a race since it occurs outside of the lock acquisition
        if ret:
            self._acquired(lease_expiration)
        elif lease_timer and lease_timer.expired:
            raise LockLeaseExpired(lock=self)
        return ret

    # this timer implements the 'timeout' parameter
    acquisition_timer = Timer(expiration=NEVER if timeout < 0 else timeout)
    while not acquisition_timer.expired:

        # the timeout on actually acquiring this lock is the minimum of:
        # 1. the time remaining on the acquisition timer, set by the 'timeout' param
        # 2. the logging interval - the minimal frequency for logging while the lock is awaited
        # 3. the time remaining on the lease timer, which would raise if expired
        timeout = min(acquisition_timer.remain, self._log_interval)
        lease_timer = self._lease_timer  # touch it once, so we don't hit a race since it occurs outside of the lock acquisition
        if lease_timer:
            timeout = min(lease_timer.remain, timeout)

        if self._lock.acquire(blocking=True, timeout=timeout):
            self._acquired(lease_expiration, should_log=acquisition_timer.elapsed > self._MIN_TIME_FOR_LOGGING)
            return True

        lease_timer = self._lease_timer  # touch it once, so we don't hit a race since it occurs outside of the lock acquisition
        if lease_timer and lease_timer.expired:
            raise LockLeaseExpired(lock=self)

        _logger.debug("%s - waiting...", self)
def exclusive(self, need_to_wait_message=None):
    while not self.cond.acquire(timeout=15):
        _logger.debug("%s - waiting...", self)

    # wait until this thread is the sole owner of this lock
    while not self.cond.wait_for(lambda: self.owner_count == self.owners[_get_my_ident()], timeout=15):
        _check_exiting()
        if need_to_wait_message:
            _logger.info(need_to_wait_message)
            need_to_wait_message = None  # only print it once
        _logger.debug("%s - waiting (for exclusivity)...", self)

    my_ident = _get_my_ident()
    self.owners[my_ident] += 1
    self._lease_timer = Timer()
    _verbose_logger.debug("%s - acquired (exclusively)", self)

    try:
        yield
    finally:
        _verbose_logger.debug('%s - releasing (exclusive)', self)
        self._lease_timer = None
        self.owners[my_ident] -= 1
        if not self.owners[my_ident]:
            self.owners.pop(my_ident)  # don't inflate the soft lock keys with threads that do not own it
        self.cond.notify()
        self.cond.release()
def _logged_func(self):
    stack = ExitStack()
    self.exc = None
    self.timer = Timer()
    stack.callback(self.timer.stop)
    stack.callback(self.stop)
    try:
        if not self.console_logging:
            stack.enter_context(_logger.suppressed())
        _logger.debug("%s - starting", self)
        while True:
            self._result = self.func(*self.args, **self.kwargs)
            if not self.loop:
                return
            if self.wait(self.sleep):
                _logger.debug("%s - stopped", self)
                return
    except ProcessExiting as exc:
        _logger.debug(exc)
        raise
    except (KeyboardInterrupt, Exception) as exc:
        _logger.silent_exception(
            "Exception in thread running %s: %s (traceback can be found in debug-level logs)",
            self.func, type(exc))
        self.exc = exc
        try:
            exc.timestamp = time.time()
        except Exception:
            pass
    finally:
        stack.close()
def __call__(self):
    if not GRAPHICAL:
        yield self
        return

    handler = get_console_handler()
    with self._lock:
        self._depth += 1
        if self._depth == 1:
            self.set_message("Waiting...")
            self._stop = False
            self._timer = Timer()
            self._format = handler.formatter.format if handler else lambda record: record.getMessage()
            self._thread = threading.Thread(target=self.loop, name="ProgressBar", daemon=True)
            self._thread.start()

    try:
        yield self
    finally:
        with self._lock:
            self._depth -= 1
            if self._depth <= 0:
                self._stop = True
                self._event.set()
                self._thread.join()
                self._thread = None
def get_connection(self, timeout_override=None):
    if timeout_override:
        kw = dict(self._conn_params, timeout=timeout_override)
        _logger.debug("creating an ad-hoc connection (timeout=%s)", timeout_override)
        return httpclient.HTTPConnection(**kw)

    conn = self._conn
    if not conn:
        _logger.debug("creating a new connection (timeout=%s, ttl=%s)", self._conn_params.timeout, self._ttl)
    elif conn.timer.expired:
        _logger.debug("connection expired (%ss ago) - resetting", conn.timer.expired)
        conn = None
    else:
        conn.timer.reset()

    if not conn:
        conn = httpclient.HTTPConnection(**self._conn_params)
        if not self._ttl:
            return conn
        conn.timer = Timer(expiration=self._ttl)

    self._conn = conn
    return conn
def logged_wait(self, timeout=None, initial_log_interval=2 * MINUTE):
    """
    Wait for all futures to complete, logging their status along the way.

    Logging will occur at an ever-increasing log interval, beginning with
    ``initial_log_interval``, and increasing 5-fold (x5) every 5 iterations.
    """
    log_interval = initial_log_interval
    global_timer = Timer(expiration=timeout)
    iteration = 0
    while not global_timer.expired:
        completed, pending = self.wait(log_interval)
        if not pending:
            break
        iteration += 1
        if iteration % 5 == 0:
            log_interval *= 5
        with _logger.indented("(Waiting for %s on %s/%s tasks...)",
                              time_duration(global_timer.elapsed),
                              len(pending), sum(map(len, (completed, pending))),
                              level=logging.WARNING, footer=False):
            self.dump_stacks(pending, verbose=global_timer.elapsed >= HOUR)
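# Illustration added for clarity (not from the original source): how logged_wait's log
# interval grows. It stays at `initial_log_interval` for 5 iterations and is then
# multiplied by 5, so with the 2-minute default the waits go 2m x5, then 10m x5, and so on.
def _log_interval_schedule(initial=2 * MINUTE, iterations=12):
    interval, schedule = initial, []
    for i in range(1, iterations + 1):
        schedule.append(interval)
        if i % 5 == 0:
            interval *= 5
    return schedule

assert _log_interval_schedule(initial=120)[:7] == [120, 120, 120, 120, 120, 600, 600]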
def _logged_func(self):
    self.timer = Timer()
    self.exc = None
    try:
        _logger.debug("%s - starting", self)
        while True:
            self.result = self.func(*self.args, **self.kwargs)
            if not self.loop:
                return
            if self.wait(self.sleep):
                _logger.debug("%s - stopped", self)
                return
    except Exception as exc:
        _logger.silent_exception("Exception in thread running %s (traceback can be found in debug-level logs)", self.func)
        self.exc = exc
    finally:
        self.timer.stop()
        self.stop()
def wait(self, timeout=None):
    if self.real_thread_no_greenlet:
        # we can't '.wait' on this gevent event object, so instead we test it and sleep manually:
        timer = Timer(expiration=timeout)
        while not timer.expired:
            if self.stopper.is_set():
                return True
            non_gevent_sleep(0.1)
        return False
    return self.stopper.wait(timeout)
def retry(times, func, args=[], kwargs={}, acceptable=Exception, sleep=1, max_sleep=False,
          log_level=logging.DEBUG, pred=None, unacceptable=()):
    if unacceptable is None:
        unacceptable = ()
    elif isinstance(unacceptable, tuple):
        unacceptable += UNACCEPTABLE_EXCEPTIONS
    else:
        unacceptable = (unacceptable, ) + UNACCEPTABLE_EXCEPTIONS

    if isinstance(times, Timer):
        stopper = times  # a timer is a valid stopper
    elif isinstance(times, Duration):
        stopper = Timer(expiration=times)
    elif isinstance(times, int):
        stopper = ExpiringCounter(times)
    else:
        assert False, "'times' must be an 'int', 'Duration' or 'Timer', got %r" % times

    if max_sleep:
        sleep = RandomExponentialBackoff(sleep, max_sleep)
    if not pred:
        pred = lambda exc: True

    while True:
        try:
            return func(*args, **kwargs)
        except unacceptable as exc:
            raise
        except acceptable as exc:
            raise_if_async_exception(exc)
            if not pred(exc):
                raise
            if stopper.expired:
                raise
            _logger.log(log_level, "Exception thrown: %r", exc)
            sleep_for = 0
            if sleep:
                # support for ExponentialBackoff
                sleep_for = sleep() if callable(sleep) else sleep
            _logger.log(log_level, "Retrying... (%r remain) in %s seconds", stopper.remain, sleep_for)
            time.sleep(sleep_for)
def test_time_interval1():
    st = 150000000
    t = Timer(st)
    t.t1 = t.t0 + 1
    ti = t.to_interval()
    assert t in ti
    assert t.t0 in ti
    assert t.t1 in ti
    assert t.t0 - 1 not in ti
    assert t.t1 + 1 not in ti
    assert ti.duration == t.duration
    assert ti.duration == 1
    assert ti.duration_delta.total_seconds() == 1
    assert str(ti) == '<TI 05:40:00..(1s)..05:40:01>'
    assert str(t.to_interval().to_timer()) == str(t)
def _get_main_loop(self):
    while True:
        items = []
        items.append(self._commands_queue.get())
        t = Timer(expiration=10.0 / 1000.0)
        while not t.expired:
            try:
                items.append(self._commands_queue.get(timeout=t.remain))
            except Empty:
                break
        if not items:
            continue
        self._current_workers.acquire(timeout=10)
        self._executors.submit(_run_with_exception_logging, self._send_data, (items, ), {}, {})
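# A standalone sketch (hypothetical helper, not from the original module) of the batching
# trick used in _get_main_loop above: block for the first item, then keep draining the
# queue for up to ~10ms, using Timer.remain as the shrinking per-get timeout, so that
# items are submitted in small batches rather than one by one.
from queue import Empty

def drain_batch(commands_queue, window=10.0 / 1000.0):
    items = [commands_queue.get()]  # block until at least one item is available
    t = Timer(expiration=window)
    while not t.expired:
        try:
            items.append(commands_queue.get(timeout=t.remain))
        except Empty:
            break
    return items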
def test_wait_long_predicate():
    """
    After the actual check, the predicate holds for another 0.3 seconds.
    Make sure we don't get a timeout after 0.2 seconds - the condition itself
    is met within 0.1 seconds!
    """
    t = Timer()

    def pred():
        try:
            return 0.1 < t.duration
        finally:
            wait(0.3)

    wait(0.2, pred, message=False)
def logged_wait(self, timeout=None, initial_log_interval=None):
    log_interval = initial_log_interval or 2 * MINUTE
    global_timer = Timer(expiration=timeout)
    iteration = 0
    while not global_timer.expired:
        completed, pending = self.wait(log_interval)
        if not pending:
            break
        iteration += 1
        if iteration % 5 == 0:
            log_interval *= 5
        with _logger.indented("(Waiting for %s on %s/%s tasks...)",
                              time_duration(global_timer.elapsed),
                              len(pending), sum(map(len, (completed, pending))),
                              level=logging.WARNING, footer=False):
            self.dump_stacks(pending, verbose=global_timer.elapsed >= HOUR)
def wait_and_kill(self):
    timer = Timer(expiration=self.timeout)
    if not self.quiet:
        _logger.info("Timebomb set - this process will YELLOW<<self-destruct>> in RED<<%r>>...", timer.remain)
    while not timer.expired:
        if self.alert_interval:
            _logger.info("Time Elapsed: MAGENTA<<%r>>", timer.elapsed)
        log_level = logging.WARNING if timer.remain < 5 * MINUTE else logging.DEBUG
        _logger.log(log_level, "This process will YELLOW<<self-destruct>> in RED<<%r>>...", timer.remain)
        if self.fuse.wait(min(self.alert_interval or 60, timer.remain)):
            _logger.info("Timebomb cancelled")
            return
    _logger.warning("RED<< 💣 Timebomb Expired! 💣 >>")
    with _logger.indented("Killing children"):
        kill_subprocesses()
    with _logger.indented("Committing suicide"):
        kill_this_process(graceful=False)
    raise Exception("Timebomb Expired")
def put_command(self, *, job_repr, **job_data):
    """
    Put the command in the commands queue

    For internal use

    :param [str] job_repr: A string to print the command for logging purposes
    :param [dict] job_data: Command data to send to the Talker agent
    """
    get_logger().debug("submitting job [%s] (%s)", job_repr, self.job_id)
    self.talker.reactor.rpush(self._commands_key, json.dumps(job_data), _callback=self.on_sent, _cmd_id=self.job_id)

    # making the expiration long enough so it's unlikely to expire in the middle of pushing a new command - see WEKAPP-86513
    # a 5m expiration could fall exactly when we push a new command, causing it to expire before the agent pops it out;
    # a 1d expiration is long enough since new commands are likely to be pushed in the interim, extending the expiration
    # we can't call 'expire' before 'rpush', since it would be a no-op if the key doesn't exist yet
    self.talker.reactor.expire(self._commands_key, timedelta(days=1))
    self.handling_timer = Timer(expiration=AGENT_SEND_TIMEOUT)
def test_iter_wait_progress_inbetween_sleep():
    data = Bunch(a=3)

    def get():
        data.a -= 1
        return data.a

    sleep = .07
    g = iter_wait_progress(get, advance_timeout=10, sleep=sleep)

    # first iteration should be immediate
    t = Timer()
    next(g)
    assert t.duration < sleep

    # subsequent iteration should be at least 'sleep' long
    next(g)
    assert t.duration >= sleep

    for state in g:
        pass

    assert state.finished is True
def retry(times: Union[int, Duration, Timer], func, args=[], kwargs={}, acceptable=Exception, sleep=1,
          max_sleep=False, log_level=logging.DEBUG, pred=None, unacceptable=()):
    """
    Runs a function again and again until it finishes without throwing.

    :param times: Limit the number of attempts before errors are propagated instead of suppressed.
        If an `int`, the execution is retried at most `times` times.
        If a `Timer`, retries until the timer expires.
        If a `Duration`, retries for the specified duration.
    :param func: The function to run.
    :param list args: Positional arguments for the function.
    :param dict kwargs: Keyword arguments for the function.
    :param acceptable: Exception (or tuple of exceptions) to repeat the function on.
    :param sleep: Time to wait between multiple retries.
    :param max_sleep: If set, increase the sleep after each retry until reaching this number.
    :param log_level: Before retrying, log the exceptions at that level.
    :param pred: If set, decide whether or not to retry on an exception.
    :param unacceptable: Exception (or tuple of exceptions) to not repeat the function on and re-raise them instead.
    """
    if unacceptable is None:
        unacceptable = ()
    elif isinstance(unacceptable, tuple):
        unacceptable += UNACCEPTABLE_EXCEPTIONS
    else:
        unacceptable = (unacceptable, ) + UNACCEPTABLE_EXCEPTIONS

    if isinstance(times, Timer):
        stopper = times  # a timer is a valid stopper
    elif isinstance(times, Duration):
        stopper = Timer(expiration=times)
    elif isinstance(times, int):
        stopper = ExpiringCounter(times)
    else:
        assert False, "'times' must be an 'int', 'Duration' or 'Timer', got %r" % times

    if max_sleep:
        sleep = RandomExponentialBackoff(sleep, max_sleep)
    if not pred:
        pred = lambda exc: True

    while True:
        try:
            return func(*args, **kwargs)
        except unacceptable as exc:
            raise
        except acceptable as exc:
            raise_if_async_exception(exc)
            if not pred(exc):
                raise
            if stopper.expired:
                raise
            _logger.log(log_level, "Exception thrown: %r", exc)
            sleep_for = 0
            if sleep:
                # support for ExponentialBackoff
                sleep_for = sleep() if callable(sleep) else sleep
            _logger.log(log_level, "Retrying... (%r remain) in %s seconds", stopper.remain, sleep_for)
            time.sleep(sleep_for)
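# Hedged usage sketch for retry() above (not from the original source; `flaky_rpc` is a
# made-up stand-in for any function that may throw transiently):
def flaky_rpc():
    ...

# retry up to 5 times, sleeping 2s between attempts, but only on ConnectionError
retry(5, flaky_rpc, acceptable=ConnectionError, sleep=2)

# retry for up to 30 seconds - a Timer (or Duration) acts as the stopper instead of a
# counter - with randomized exponential backoff capped at 10s between attempts
retry(Timer(expiration=30), flaky_rpc, sleep=1, max_sleep=10)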
def _acquired(self, lease_expiration, should_log=False):
    # we don't want to replace the lease timer, so as not to effectively extend the original lease
    self._lease_timer = self._lease_timer or Timer(expiration=lease_expiration or self._lease_expiration)
    if should_log:
        _logger.debug("%s - acquired", self)
class Cmd(object):
    """
    A class to represent a command to run on a host

    :param [str] host_id: Host ID to run the command on
    :param [TalkerClient] talker: The Talker client that is used to send the command
    :param [bool] raise_on_failure: Should the command raise an exception on failure
    :param [Tuple[int]] retcode: What return codes are considered success
    :param [List[str]] args: List of the command arguments
    :param [float, int] timeout: Command timeout in seconds
    :param [bool] server_timeout: Whether to also enforce the timeout on the agent
    :param [float, int] line_timeout: Command output timeout in seconds
    :param [str] log_file: Log file path on the host to write command output to
    :param [str] name: Command name for logging purposes
    :param [int] max_output_per_channel: Maximum size of the command output to generate in bytes
    :param [str] set_new_logpath: Set a new path for the Talker agent log file
    """

    def __init__(
            self,
            host_id,
            talker,
            raise_on_failure=True,
            retcode=(0, ),
            args=None,
            timeout=HOUR,
            server_timeout=True,
            line_timeout=None,
            log_file=None,
            name=None,
            max_output_per_channel=None,
            set_new_logpath=None,
    ):
        self.name = name
        self.job_id = str(uuid4())
        self.talker = talker
        self.retcode = None
        self.aborted_by = None
        self.ack = None
        self.stdout = []
        self.stderr = []
        self.raise_on_failure = raise_on_failure
        self.good_codes = retcode
        self.args = args
        self.host_id = host_id
        self.hostname = host_id.partition('.')[-1]
        self.ack_timer = None
        self.handling_timer = None
        self.timeout = timeout or HOUR  # no command should run over an hour, unless specified explicitly
        self.server_timeout = server_timeout
        self.line_timeout = Duration(line_timeout) if line_timeout else None
        self.log_file = log_file
        self.max_output_per_channel = max_output_per_channel or MAX_OUTPUT_PER_CHANNEL
        self.set_new_logpath = set_new_logpath
        self.client_timer = Timer(expiration=self.timeout if not server_timeout else self.timeout + 5)
        self._line_timeout_synced = False
        self.attempts = 0  # in case of TalkerCommandLost

        # this is the version from which the agent acknowledges receipt of a command
        self.ack_supported = self.talker.agent_version >= V1_3_1
        # this is the version from which the agent ignores params it does not support
        self.kwargs_resilient = self.talker.agent_version >= V1_3_1
        # this is the version from which the agent reports the command's PID
        self.pid_supported = self.talker.agent_version >= V1_6_0

    def __repr__(self):
        return "%s(<%s>%s)" % (self.__class__.__name__, self.job_id,
                               "!" if (self.ack_supported and not self.ack) else "")

    @property
    def _result_key(self):
        return "result-%s" % self.job_id

    @property
    def _ack_key(self):
        return "%s-ack" % self._result_key

    @property
    def _stdout_key(self):
        return "%s-stdout" % self._result_key

    @property
    def _stderr_key(self):
        return "%s-stderr" % self._result_key

    @property
    def _exit_code_key(self):
        return "%s-retcode" % self._result_key

    @safe_property
    def _pid_key(self):
        return "%s-pid" % self._result_key

    @property
    def _commands_key(self):
        return "commands-%s" % self.host_id

    @property
    def timed_out(self):
        """
        Is this command timed out

        :rtype: bool
        """
        return self.aborted_by == AbortedBy.timeout

    @property
    def since_started(self):
        """
        Get the time since the command was acked

        :rtype: Duration
        """
        return Duration(time.time() - self.ack) if self.ack else None

    def get_pid(self, timeout=JOB_PID_TIMEOUT, wait_for_ack=True):
        """
        Get the command process ID

        :param [int, float] timeout: After how many seconds should we throw a timeout error
        :param [bool] wait_for_ack: Wait for ack before checking for the command PID
        :returns: The command PID
        :rtype: int
        """
        if not self.pid_supported:
            raise NotImplementedError(
                'This version (%s) does not support get_pid - need %s or better'
                % (self.talker.agent_version, V1_6_0))
        if wait_for_ack:
            self.wait(for_ack=True)
        res = self.talker.reactor.get(self._pid_key, timeout=timeout)
        return int(res)

    def put_command(self, *, job_repr, **job_data):
        """
        Put the command in the commands queue

        For internal use

        :param [str] job_repr: A string to print the command for logging purposes
        :param [dict] job_data: Command data to send to the Talker agent
        """
        get_logger().debug("submitting job [%s] (%s)", job_repr, self.job_id)
        self.talker.reactor.rpush(self._commands_key, json.dumps(job_data), _callback=self.on_sent, _cmd_id=self.job_id)

        # making the expiration long enough so it's unlikely to expire in the middle of pushing a new command - see WEKAPP-86513
        # a 5m expiration could fall exactly when we push a new command, causing it to expire before the agent pops it out;
        # a 1d expiration is long enough since new commands are likely to be pushed in the interim, extending the expiration
        # we can't call 'expire' before 'rpush', since it would be a no-op if the key doesn't exist yet
        self.talker.reactor.expire(self._commands_key, timedelta(days=1))
        self.handling_timer = Timer(expiration=AGENT_SEND_TIMEOUT)

    @log_context(host="{.hostname}")
    def send(self):
        """
        Send the command to the Talker agent on the host
        """
        args = [str(a) for a in self.args]
        command_string = ' '.join(x if ' ' not in x else '"%s"' % x for x in args)
        timeout = self.timeout if self.server_timeout else None
        extras = {}
        if self.kwargs_resilient:
            extras.update(
                max_output_per_channel=self.max_output_per_channel,
                set_new_logpath=self.set_new_logpath,
                line_timeout=self.line_timeout,
                log_file=self.log_file,
            )

        self.put_command(id=self.job_id, cmd=args, timeout=timeout, job_repr=compact(command_string, 100), **extras)

    def on_sent(self):
        if not self.ack_timer:
            self.ack_timer = Timer(expiration=TALKER_CONTEXT.ack_timeout)  # this is the expiration on the agent ack'ing the command

    @property
    def is_sent(self):
        """
        Is the command sent to the host

        :rtype: bool
        """
        return bool(self.ack_timer)

    @log_context(host="{.hostname}")
    def reset_server_timeout(self):
        """
        Reset the timeout on the Talker agent for this command
        """
        _logger.debug("Resetting timeout, %s, %s", self.job_id, self.timeout)
        self.client_timer.reset()
        self.talker.reactor.rpush(
            self._commands_key,
            json.dumps(dict(id=self.job_id, cmd="reset_timeout", new_timeout=self.timeout)))
        self.talker.reactor.expire(self._commands_key, timedelta(minutes=5))

    @log_context(host="{.hostname}")
    def send_signal(self, sig):
        """
        Send a signal to the process running the command

        :param [int] sig: The signal to send to the command
        """
        self.talker.reactor.rpush(
            self._commands_key,
            json.dumps(dict(
                id=self.job_id,
                cmd="signal",
                signal=sig,
            )))
        self.talker.reactor.expire(self._commands_key, timedelta(minutes=5))

    @log_context(host="{.hostname}")
    def kill(self, graceful_timeout=3):
        """
        Kill the process running the command

        This will send a SIGTERM signal to the process, wait for graceful_timeout seconds,
        check if the process died and if not, send the SIGKILL signal.

        :param [int, float] graceful_timeout: Seconds to wait between the SIGTERM and SIGKILL signals
        """
        self.talker.reactor.rpush(
            self._commands_key,
            json.dumps(dict(
                id=self.job_id,
                cmd="kill",
                graceful_timeout=graceful_timeout,
            )))
        self.talker.reactor.expire(self._commands_key, timedelta(minutes=5))

    def _sync_timeout(self, line_timeout):
        if self._line_timeout_synced:
            return
        else:
            self._line_timeout_synced = True
            if line_timeout and line_timeout > self.timeout:
                self.timeout = int(line_timeout * 1.15)
                self.reset_server_timeout()

    def iter_results(self, line_timeout=DAY, timeout=DAY):
        """
        Iterate return code and output results

        :param [int, float] line_timeout: Timeout for new output to be received in seconds
        :param [int, float] timeout: Timeout for the command to finish in seconds
        :returns: A tuple with the channel name and value. Channel names are `retcode`, `stdout` and `stderr`
        :rtype: Tuple[str, str]
        """
        timeout = timeout or DAY
        line_timeout = line_timeout or DAY
        blpop_timeout = min(line_timeout, timeout // 10, self.timeout // 10, AGENT_ACK_TIMEOUT // 10)
        session_timer = Timer(expiration=timeout)  # timeout for this invocation (not the entire command duration)
        line_timer = Timer(expiration=line_timeout)
        timeout_reset_timer = Timer(expiration=max(line_timeout // 10, 1))
        self._sync_timeout(line_timeout=line_timeout)

        while self.retcode is None:
            if line_timer.expired:
                self.raise_exception(exception_cls=CommandLineTimeout, line_timeout=Duration(line_timeout))
            if session_timer.expired:
                self.raise_exception(exception_cls=CommandTimeout, timeout=timeout)

            channels = {
                self._ack_key: 'ack',
                self._stderr_key: 'stderr',
                self._stdout_key: 'stdout',
                self._exit_code_key: 'retcode'
            }
            res = self.talker.reactor.blpop(
                [self._ack_key, self._stderr_key, self._stdout_key, self._exit_code_key],
                timeout=blpop_timeout)

            if not res:
                if not self.ack:
                    # no point checking these timers' timeouts if the command hasn't been acknowledged yet
                    line_timer.reset()
                    session_timer.reset()
                self.check_client_timeout()
                continue
            else:
                channel_name, value = res
                channel_name = channels[channel_name.decode('utf-8')]
                line_timer.reset()
                if timeout_reset_timer.expired:
                    timeout_reset_timer.reset()
                    self.reset_server_timeout()
                if channel_name == 'stderr':
                    self.on_output(self.stderr, (value, ))
                elif channel_name == 'stdout':
                    self.on_output(self.stdout, (value, ))
                elif channel_name == 'retcode':
                    # see if there are any leftovers after receiving the exit code
                    stdout, stderr = self.get_output()
                    for data_channel_name, values in (('stdout', stdout), ('stderr', stderr)):
                        for data_value in values:
                            yield (data_channel_name, data_value)
                    value = self.set_retcode(value)
                elif channel_name == 'ack':
                    self.on_ack(value)
                    continue
                yield (channel_name, value)

        self.raise_if_needed()

    def iter_lines(self, line_timeout=DAY, timeout=DAY):
        """
        Iterate command output

        :param [int, float] line_timeout: Timeout for new output to be received in seconds
        :param [int, float] timeout: Timeout for the command to finish in seconds
        :returns: A tuple containing a line from stdout or None and a line from stderr or None
        :rtype: Tuple[str, str]
        """
        done = False
        parts = {'stdout': '', 'stderr': ''}

        def return_for_channel(channel, line):
            if channel == 'stdout':
                return (line, None)
            elif channel == 'stderr':
                return (None, line)

        def yield_channel(channel):
            lines = parts[channel].splitlines(keepends=True)
            parts[channel] = ''
            for line in lines:
                if line.endswith('\n'):
                    yield return_for_channel(channel, line.strip('\n'))
                else:
                    parts[channel] = line
            if done:
                yield return_for_channel(channel, parts[channel])

        for channel, value in self.iter_results(line_timeout=line_timeout, timeout=timeout):
            if channel in ['stdout', 'stderr']:
                try:
                    parts[channel] += value.decode('utf-8')
                except UnicodeDecodeError:
                    _logger.warning("Unable to decode %s to utf-8!", value)
                    parts[channel] += "%s" % value
                yield from yield_channel(channel)
            elif channel == 'retcode':
                done = True

    @log_context(host="{.hostname}")
    def log_pipe(self, logger, stdout_lvl=logging.INFO, stderr_lvl=logging.ERROR, line_timeout=DAY):
        """
        Pipe command output to a logger

        :param [logging.Logger] logger: The logger to use to log command output
        :param [int] stdout_lvl: Logging level to use in order to log stdout
        :param [int] stderr_lvl: Logging level to use in order to log stderr
        :param [int, float] line_timeout: Timeout for new output to be received in seconds
        """
        for stdout, stderr in self.iter_lines(line_timeout=line_timeout):
            if stdout_lvl is not None and stdout:
                logger.log(stdout_lvl, stdout)
            if stderr_lvl is not None and stderr:
                logger.log(stderr_lvl, stderr)

    @log_context(host="{.hostname}")
    def foreground(self, stdout=True, stderr=True):
        """
        Print command output as it is received from the agent

        :param [bool] stdout: Should stdout be printed
        :param [bool] stderr: Should stderr be printed
        :returns: The command return code when done
        :rtype: int
        """
        should_print = dict(stdout=stdout, stderr=stderr)
        for channel, value in self.iter_results():
            if channel in ['stdout', 'stderr'] and should_print[channel]:
                print(value.decode('utf-8'), end='')
            elif channel == 'retcode':
                return self.retcode
            else:
                self.raise_exception(exception_cls=UnknownChannel, channel=channel)

    def check_client_timeout(self):
        """
        Check if the command has timed out

        This will raise an appropriate exception depending on what has timed out:

        * If the reactor hasn't sent the command before the handling timer expired: TalkerClientSendTimeout
        * If the command has yet to be acked and the agent is alive: TalkerCommandLost
        * If the command has yet to be acked and the agent is not responding: TalkerServerTimeout
        * If ack is not supported and the command has expired: ClientCommandTimeoutError
        """
        if not self.is_sent:
            # client/reactor did not send the command yet
            if self.handling_timer.expired:
                self.raise_exception(exception_cls=TalkerClientSendTimeout, timeout=self.handling_timer.elapsed)
        elif self.ack_supported and not self.ack:
            is_talker_alive_name = 'is_talker_alive'
            # client did not receive command - check all timers
            if self.ack_timer.expired or self.client_timer.expired:
                talker_alive = False
                if self.name != is_talker_alive_name:  # prevent recursion
                    try:
                        self.talker.run(self.host_id, 'true', name=is_talker_alive_name, timeout=5).wait()
                    except TalkerServerTimeout:
                        pass
                    else:
                        # check again, since talker may have just booted
                        self.poll(check_client_timeout=False)
                        if self.ack:
                            # seems like our command was finally answered
                            _logger.debug("we thought TalkerCommandLost, but finally not so... (%s)", self)
                            return
                        talker_alive = True
                        if self.attempts < TALKER_COMMAND_LOST_RETRY_ATTEMPTS:
                            self.attempts += 1
                            self.ack_timer = None
                            self.send()
                            return
                exception_cls = TalkerCommandLost if talker_alive else TalkerServerTimeout
                self.raise_exception(exception_cls=exception_cls, timeout=self.ack_timer.elapsed)
        elif self.client_timer.expired:
            self.raise_exception(exception_cls=ClientCommandTimeoutError, timeout=self.ack_timer.elapsed, started=self.since_started)

    def raise_if_needed(self):
        """
        Raise an error for the command failure. If the command succeeded, do nothing
        """
        if self.aborted_by == AbortedBy.timeout:
            self.raise_exception(exception_cls=CommandTimeoutError, timeout=Duration(self.timeout))

        if self.aborted_by == AbortedBy.reboot:
            self.raise_exception(exception_cls=CommandAbortedByReboot)

        if self.aborted_by == AbortedBy.overflowed:
            self.raise_exception(exception_cls=CommandAbortedByOverflow, max_output_per_channel=self.max_output_per_channel)

        if self.aborted_by == AbortedBy.hanging:
            self.raise_exception(exception_cls=CommandHanging)

        if self.aborted_by == AbortedBy.orphaned:
            self.raise_exception(exception_cls=CommandOrphaned)

        if self.aborted_by == AbortedBy.line_timeout:
            self.raise_exception(exception_cls=CommandLineTimeout, line_timeout=self.line_timeout)

        if self.aborted_by == AbortedBy.error:
            _, traceback = self.get_output(new=False)
            traceback = self._decode_output(traceback, safe=True).strip()
            exception = traceback.splitlines()[-1]
            raise TalkerError(host_id=self.host_id, hostname=self.hostname, fulltb=traceback, _exception=exception)

        if self.retcode is None:
            return

        if self.raise_on_failure and self.retcode not in self.good_codes:
            self.raise_exception()

    def raise_exception(self, exception_cls=CommandExecutionError, **params):
        """
        Raise an exception for this command
        """
        def process_output(d):
            text = self._decode_output(d, safe=True)
            if len(text) > 1010:
                text = "...%s" % text[-1000:]
            return text

        log_files = self.log_file if isinstance(self.log_file, (tuple, list)) else [self.log_file] * 2
        stds = [None] * 2 if all(log_files) else (process_output(std) for std in self.get_output(new=False))

        for (param, std, log_file) in zip("stdout stderr".split(), stds, log_files):
            params[param] = ("<%s>" % log_file) if log_file else std

        raise exception_cls(cmd=self.job_id,
                            name=self.name or "(unnamed command)",
                            args=" ".join(map(str, self.args)),
                            host_id=self.host_id,
                            hostname=self.hostname,
                            retcode=self.retcode,
                            since_started=self.since_started,
                            talker=self.talker,
                            **params)

    def on_polled(self, exit_code):
        """
        Report return code for the command

        :param [int] exit_code: The command return code
        :returns: The command return code or None if not yet set
        :rtype: int
        """
        if self.retcode is not None:
            return self.retcode
        if exit_code is None:
            return
        _verbose_logger.debug("%s: exit-code received (%s)", self, exit_code)
        self.set_retcode(exit_code)
        return self.retcode

    def poll(self, check_client_timeout=True):
        """
        Check if the command is finished and return its return code

        :param [bool] check_client_timeout: Also check if the command timed out
        :returns: The command return code if done
        :rtype: int
        """
        if self.retcode is not None:
            return self.retcode

        if self.ack_supported and not self.ack:
            ack = self.talker.reactor.lpop(self._ack_key)
            if ack is None:
                if check_client_timeout:
                    self.check_client_timeout()
                return
            self.on_ack(ack)

        exit_code = self.talker.reactor.lpop(self._exit_code_key)
        if exit_code is None:
            if check_client_timeout:
                self.check_client_timeout()
        return self.on_polled(exit_code)

    def set_retcode(self, exit_code):
        """
        Set the command return code

        :param [int] exit_code: The command return code
        """
        if exit_code is None:
            return

        try:
            exit_code = int(exit_code)
        except ValueError:
            exit_code = exit_code.decode('utf-8')

        self.on_sent()  # sometimes we get here before the callback is called
        self.ack_timer.stop()

        if isinstance(exit_code, int):
            self.retcode = exit_code
        elif hasattr(AbortedBy, exit_code):
            self.aborted_by = getattr(AbortedBy, exit_code)
            self.retcode = -1
        else:
            self.raise_exception(exception_cls=UnknownExitCode, exit_code=exit_code)

        self.raise_if_needed()

    def wait(self, for_ack=False):
        """
        Wait until the command is done

        :param [bool] for_ack: Wait for the command to be acked instead of for the command to finish
        :returns: The command return code when done
        :rtype: int
        """
        while True:
            if for_ack:
                assert self.ack_supported, "ack is not supported"
                if self.ack is not None:
                    return self.ack
            if self.retcode is not None:
                return self.retcode

            # we don't want to wait too long, cause we want to raise timeout exceptions promptly
            blpop_timeout = (1 if not self.is_sent else
                             self.ack_timer.expiration // 10 if (self.ack_supported and not self.ack) else
                             self.client_timer.expiration // 10)

            result = self.talker.reactor.blpop([self._exit_code_key, self._ack_key], timeout=blpop_timeout)
            if result is None:
                self.check_client_timeout()
                continue

            key, value = result
            key = key.decode()
            if key == self._ack_key:
                _verbose_logger.debug("%s: ack received (%s)", self, value)
                self.on_ack(value)
                continue

            return self.on_polled(result[1])

    def result(self, decode='utf8'):
        """
        Wait until the command is done and return its stdout output

        :param [str] decode: Expected output encoding
        :returns: The command stdout output
        :rtype: str
        """
        self.wait()
        stdout, _ = self.get_output(new=False)
        return self._decode_output(stdout, decode=decode)

    def _decode_output(self, output, decode='utf8', safe=False):
        ret = b''.join(output)
        if decode:
            ret = ret.decode(decode, errors='replace' if safe else 'strict')
        return ret

    def _request_outputs(self, pipeline):
        pipeline.lrange(self._stdout_key, 0, -1)
        pipeline.lrange(self._stderr_key, 0, -1)

    def _fetch_outputs(self):
        with self.talker.reactor.pipeline() as p:
            self._request_outputs(p)
            new_stdout, new_stderr = p.execute()
        return new_stdout, new_stderr

    def _trim_outputs(self, stdout, stderr, pipeline=None):
        len_stdout = len(stdout)
        len_stderr = len(stderr)
        with ExitStack() as stack:
            if pipeline is not None:
                p = pipeline
            else:
                p = stack.enter_context(self.talker.reactor.pipeline())
            for data_len, redis_key in \
                    ((len_stdout, self._stdout_key), (len_stderr, self._stderr_key)):
                if data_len:
                    p.ltrim(redis_key, data_len, -1)
            if pipeline is None:
                p.execute()

    def on_output(self, channel, data):
        """
        Update the command output

        :param [List[bytes]] channel: The channel to update. One of `self.stdout` or `self.stderr`
        :param [bytes] data: New data to add to one of the channels
        """
        if data:
            channel.extend(data)

    def on_ack(self, ack):
        """
        Update the command ack

        :param [bytes] ack: The command start time on the host
        """
        self.ack = float(ack.decode())

    def get_output(self, new=True):
        """
        Get raw command output

        :param [bool] new: Get only new output
        :returns: Stdout and stderr from the command
        :rtype: Tuple[bytes, bytes]
        """
        new_stdout, new_stderr = self._fetch_outputs()
        self.on_output(self.stdout, new_stdout)
        self.on_output(self.stderr, new_stderr)
        self._trim_outputs(stdout=new_stdout, stderr=new_stderr)
        if new:
            return new_stdout, new_stderr
        else:
            return self.stdout, self.stderr
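# Hedged usage sketch based only on the Cmd methods shown above (`talker` and `host_id`
# are placeholders; talker.run() returning a Cmd is inferred from check_client_timeout):
cmd = talker.run(host_id, 'ls', '-la', timeout=60)
retcode = cmd.wait()     # block until the exit code arrives; raises on failure by default
output = cmd.result()    # stdout, decoded as utf-8

# alternatively, stream the output as it arrives instead of waiting for completion:
for out_line, err_line in talker.run(host_id, 'dmesg', timeout=60).iter_lines(line_timeout=5 * 60):
    if out_line is not None:
        print(out_line)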
def iter_results(self, line_timeout=DAY, timeout=DAY):
    """
    Iterate return code and output results

    :param [int, float] line_timeout: Timeout for new output to be received in seconds
    :param [int, float] timeout: Timeout for the command to finish in seconds
    :returns: A tuple with the channel name and value. Channel names are `retcode`, `stdout` and `stderr`
    :rtype: Tuple[str, str]
    """
    timeout = timeout or DAY
    line_timeout = line_timeout or DAY
    blpop_timeout = min(line_timeout, timeout // 10, self.timeout // 10, AGENT_ACK_TIMEOUT // 10)
    session_timer = Timer(expiration=timeout)  # timeout for this invocation (not the entire command duration)
    line_timer = Timer(expiration=line_timeout)
    timeout_reset_timer = Timer(expiration=max(line_timeout // 10, 1))
    self._sync_timeout(line_timeout=line_timeout)

    while self.retcode is None:
        if line_timer.expired:
            self.raise_exception(exception_cls=CommandLineTimeout, line_timeout=Duration(line_timeout))
        if session_timer.expired:
            self.raise_exception(exception_cls=CommandTimeout, timeout=timeout)

        channels = {
            self._ack_key: 'ack',
            self._stderr_key: 'stderr',
            self._stdout_key: 'stdout',
            self._exit_code_key: 'retcode'
        }
        res = self.talker.reactor.blpop(
            [self._ack_key, self._stderr_key, self._stdout_key, self._exit_code_key],
            timeout=blpop_timeout)

        if not res:
            if not self.ack:
                # no point checking these timers' timeouts if the command hasn't been acknowledged yet
                line_timer.reset()
                session_timer.reset()
            self.check_client_timeout()
            continue
        else:
            channel_name, value = res
            channel_name = channels[channel_name.decode('utf-8')]
            line_timer.reset()
            if timeout_reset_timer.expired:
                timeout_reset_timer.reset()
                self.reset_server_timeout()
            if channel_name == 'stderr':
                self.on_output(self.stderr, (value, ))
            elif channel_name == 'stdout':
                self.on_output(self.stdout, (value, ))
            elif channel_name == 'retcode':
                # see if there are any leftovers after receiving the exit code
                stdout, stderr = self.get_output()
                for data_channel_name, values in (('stdout', stdout), ('stderr', stderr)):
                    for data_value in values:
                        yield (data_channel_name, data_value)
                value = self.set_retcode(value)
            elif channel_name == 'ack':
                self.on_ack(value)
                continue
            yield (channel_name, value)

    self.raise_if_needed()
def on_sent(self):
    if not self.ack_timer:
        self.ack_timer = Timer(expiration=TALKER_CONTEXT.ack_timeout)  # this is expiration on the agent ack'ing the command
class concurrent(object):

    def __init__(self, func, *args, **kwargs):
        self.func = func
        self.args = args
        self.kwargs = kwargs
        self.throw = kwargs.pop('throw', True)
        self.daemon = kwargs.pop('daemon', True)
        self.threadname = kwargs.pop('threadname', 'anon-%X' % id(self))
        self.stopper = kwargs.pop('stopper', threading.Event())
        self.sleep = kwargs.pop('sleep', 1)
        self.loop = kwargs.pop('loop', False)
        self.timer = None

        real_thread_no_greenlet = kwargs.pop('real_thread_no_greenlet', False)
        if is_module_patched("threading"):
            # in case of using apply_gevent_patch function - use this option in order to defer some jobs to real threads
            self.real_thread_no_greenlet = real_thread_no_greenlet
        else:
            # gevent isn't active, no need to do anything special
            self.real_thread_no_greenlet = False

        rimt = kwargs.pop("raise_in_main_thread", False)
        if rimt:
            exc_type = Exception if rimt is True else rimt
            self.func = raise_in_main_thread(exc_type)(self.func)

    def __repr__(self):
        flags = ""
        if self.daemon:
            flags += 'D'
        if self.loop:
            flags += 'L'
        if self.real_thread_no_greenlet:
            flags += 'T'
        return "<%s[%s] '%s'>" % (self.__class__.__name__, self.threadname, flags)

    def _logged_func(self):
        self.timer = Timer()
        self.exc = None
        try:
            _logger.debug("%s - starting", self)
            while True:
                self.result = self.func(*self.args, **self.kwargs)
                if not self.loop:
                    return
                if self.wait(self.sleep):
                    _logger.debug("%s - stopped", self)
                    return
        except Exception as exc:
            _logger.silent_exception("Exception in thread running %s (traceback can be found in debug-level logs)", self.func)
            self.exc = exc
        finally:
            self.timer.stop()
            self.stop()

    def stop(self):
        _logger.debug("%s - stopping", self)
        self.stopper.set()

    def wait(self, timeout=None):
        if self.real_thread_no_greenlet:
            # we can't '.wait' on this gevent event object, so instead we test it and sleep manually:
            if self.stopper.is_set():
                return True
            non_gevent_sleep(timeout)
            if self.stopper.is_set():
                return True
            return False
        return self.stopper.wait(timeout)

    @contextmanager
    def paused(self):
        self.stop()
        yield
        self.start()

    @contextmanager
    def _running(self):
        if _disabled:
            self._logged_func()
            yield self
            return

        if self.real_thread_no_greenlet:
            _logger.debug('sending job to a real OS thread')
            self._join = defer_to_thread(func=self._logged_func, threadname=self.threadname)
        else:
            # threading.Thread could be a real thread or a gevent-patched thread...
            self.thread = threading.Thread(target=self._logged_func, name=self.threadname, daemon=self.daemon)
            _logger.debug('sending job to %s', self.thread)
            self.stopper.clear()
            self.thread.start()
            self._join = self.thread.join
        try:
            yield self
        finally:
            self.stop()  # if we loop, stop it
            self._join()
            if self.throw and self.exc:
                raise self.exc

    def __enter__(self):
        self._ctx = self._running()
        return self._ctx.__enter__()

    def __exit__(self, *args):
        return self._ctx.__exit__(*args)

    def __iter__(self):
        with self:
            self.iterations = 0
            while not self.wait(self.sleep):
                yield self
                self.iterations += 1

    start = __enter__

    def join(self):
        self.__exit__(None, None, None)

    __del__ = join
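# Hedged usage sketch for the concurrent() helper above (not part of the original file;
# `heartbeat` and `do_main_work` are made-up placeholders): run a periodic background job
# while the calling thread does the real work.
def heartbeat():
    _logger.debug("still alive")

def do_main_work():
    pass

with concurrent(heartbeat, threadname='heartbeat', loop=True, sleep=5):
    do_main_work()
# leaving the 'with' block sets the stopper, joins the thread, and re-raises any
# exception captured in _logged_func (throw=True by default)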
def as_completed(self, cmds, sleep=0.5, timeout=DAY, initial_log_interval=12):
    """
    Yields completed commands, until all are completed or the timeout expires (if provided).
    If timeout is 0, this will effectively stop the generator instead of blocking on remaining commands

    :param [List[Cmd]] cmds: The list of commands to wait on
    :param [int, float] sleep: How long to sleep between checks
    :param [int, float] timeout: After how many seconds should we throw a timeout error
    :param [int, float] initial_log_interval: Interval between logging
    :returns: Yields commands and their results as a tuple
    :rtype: Tuple[Cmd, result???]
    """
    pending = list(cmds)

    # We prevent exceptions from being raised from this iterator, so that we can keep iterating
    # through all cmds. We'll let the caller deal with raising exceptions mid-iteration if it so desires.
    # We also need to deal with the case of re-entering this iterator with incomplete cmds, so we must
    # keep the original 'raise_on_failure' setting across invocations
    for cmd in pending:
        cmd._should_raise_on_failure = getattr(cmd, "_should_raise_on_failure", cmd.raise_on_failure)
        cmd.raise_on_failure = False

    log_interval = initial_log_interval
    logging_timer = BackoffTimer(expiration=log_interval, max_interval=HOUR)
    timeout_timer = Timer(expiration=timeout)
    had_results = False

    def _poll():
        nonlocal had_results
        results = self.poll(pending)
        done_idxes = []
        for i, result in enumerate(results):
            cmd = pending[i]
            if result is not None:
                done_idxes.append(i)
                cmd.raise_on_failure = cmd._should_raise_on_failure
                yield cmd, result

        for i in sorted(done_idxes, reverse=True):
            # alternative would be to copy the pending list, but this is more efficient
            pending.pop(i)

        if pending:
            if logging_timer.expired:
                logging_timer.backoff()
                MultiObject(pending).check_client_timeout()
                hosts = sorted(set(cmd.hostname for cmd in pending))
                _logger.info("Waiting on %s command(s) on %s host(s): %s", len(pending), len(hosts), ", ".join(hosts))
                for cmd in pending:
                    since_started = (
                        "no-ack" if not cmd.ack_supported else
                        "acked {:ago}".format(cmd.since_started) if cmd.ack else
                        "not-started")
                    _logger.debug(" job-id: %s (%s)", cmd.job_id, since_started)

        if done_idxes:
            had_results = True

    while True:
        yield from _poll()

        if not pending:
            return

        if timeout == 0:
            if had_results:
                return
            else:
                continue

        if timeout_timer.expired:
            raise TimeoutException("Commands (%s) did not complete within timeout" % len(pending),
                                   timeout=timeout, pending=pending)
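# Hedged usage sketch (assumes as_completed is exposed on the same client object as run();
# `talker` and `hosts` are placeholders): consume commands as they finish, so one slow or
# failing host doesn't block or mask the others.
cmds = [talker.run(host, 'uptime', timeout=60) for host in hosts]
for cmd, retcode in talker.as_completed(cmds, timeout=HOUR):
    if retcode != 0:
        _logger.warning("command failed on %s (retcode=%s)", cmd.hostname, retcode)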