def run_container_start_checks(self, started_at, timeout_at):
    """
    Run startup checks.
    """
    checks_start_time = time.time()
    # First, give the container implementation a chance to run its own start checks.
    while time.time() <= timeout_at:
        if not self.is_running():
            raise FactoryNotStarted("{} is no longer running".format(self))
        if self._container_start_checks():
            break
    else:
        # The loop ran out of time without the checks passing
        log.error(
            "Failed to run container start checks after %1.2f seconds",
            time.time() - checks_start_time,
        )
        return False
    # Then wait until every check port accepts TCP connections.
    check_ports = set(self.get_check_ports())
    if not check_ports:
        return True
    while time.time() <= timeout_at:
        if not self.is_running():
            raise FactoryNotStarted("{} is no longer running".format(self))
        if not check_ports:
            break
        check_ports -= ports.get_connectable_ports(check_ports)
        if check_ports:
            time.sleep(0.5)
    else:
        log.error("Failed to check ports after %1.2f seconds", time.time() - checks_start_time)
        return False
    return True
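# Illustrative sketch, not part of the library code above: the port check loop treats the
# return value of ``get_check_ports()`` as the set of TCP ports that must become
# connectable before the container start is confirmed. A hypothetical override could be
# as simple as this; the port number is an assumption made for the example only.
def _example_get_check_ports():
    # Ports the container must be accepting connections on before startup is confirmed.
    return [3306]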
def wait_for_events(self, patterns, timeout=30, after_time=None):
    """
    Wait for a set of patterns to match or until timeout is reached.

    :param ~collections.abc.Sequence patterns:
        An iterable of tuples in the form of ``("<daemon-id>", "<event-tag-pattern>")``,
        i.e. which daemon ID we're targeting and the event tag pattern which will be
        passed to :py:func:`~fnmatch.fnmatch` to assert a match.
    :keyword int,float timeout:
        The amount of time to wait for the events, in seconds.
    :keyword ~datetime.datetime,float after_time:
        After which time to start matching events.
    :return:
        An instance of :py:class:`~saltfactories.plugins.event_listener.MatchedEvents`.
    :rtype: ~saltfactories.plugins.event_listener.MatchedEvents
    """
    if after_time is None:
        after_time = datetime.utcnow()
    elif isinstance(after_time, float):
        after_time = datetime.utcfromtimestamp(after_time)
    after_time_iso = after_time.isoformat()
    log.debug(
        "%s is waiting for event patterns happening after %s: %s",
        self,
        after_time_iso,
        set(patterns),
    )
    found_events = set()
    patterns = set(patterns)
    timeout_at = time.time() + timeout
    while True:
        if not patterns:
            # Nothing left to match
            break
        for event in copy.copy(self.store):
            if event.expired:
                # Too old, carry on
                continue
            if event.stamp < after_time:
                continue
            for pattern in set(patterns):
                _daemon_id, _pattern = pattern
                if event.daemon_id != _daemon_id:
                    continue
                if fnmatch.fnmatch(event.tag, _pattern):
                    log.debug("%s Found matching pattern: %s", self, pattern)
                    found_events.add(event)
                    patterns.remove((event.daemon_id, _pattern))
        if not patterns:
            break
        if time.time() > timeout_at:
            break
        time.sleep(0.5)
    return MatchedEvents(matches=found_events, missed=patterns)
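# Illustrative usage sketch, not part of the library code above (it relies on the module's
# existing imports): a test could use the ``event_listener`` fixture provided by this
# plugin to wait for a job return event. The fixture names and the event tag pattern are
# assumptions made purely for the example.
def _example_wait_for_job_return(event_listener, salt_master, salt_minion, salt_cli):
    start_time = time.time()
    # Fire a job; the master publishes a ``salt/job/<jid>/ret/<minion-id>`` event when
    # the minion returns.
    salt_cli.run("test.ping", minion_tgt=salt_minion.id)
    matched = event_listener.wait_for_events(
        [(salt_master.id, "salt/job/*/ret/{}".format(salt_minion.id))],
        timeout=60,
        after_time=start_time,
    )
    # ``missed`` holds any (daemon-id, tag-pattern) tuples that never matched.
    assert not matched.missed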
def _cleanup(self):
    """
    Periodically remove expired events from the store while the listener is running.
    """
    cleanup_at = time.time() + 30
    while self.running_event.is_set():
        if time.time() < cleanup_at:
            time.sleep(1)
            continue

        # Reset cleanup time
        cleanup_at = time.time() + 30

        # Cleanup expired events
        to_remove = []
        for event in self.store:
            if event.expired:
                to_remove.append(event)
        for event in to_remove:
            log.debug("%s Removing from event store: %s", self, event)
            self.store.remove(event)
        log.debug("%s store size after cleanup: %s", self, len(self.store))
def run_start_checks(self, started_at, timeout_at):
    """
    Run checks to confirm that the daemon has started.
    """
    if not super().run_start_checks(started_at, timeout_at):
        return False
    if not self.event_listener:  # pragma: no cover
        log.debug("The 'event_listener' attribute is not set. Not checking events...")
        return True

    check_events = set(self.get_check_events())
    if not check_events:
        log.debug("No events to listen to for %s", self)
        return True

    log.debug("Events to check for %s: %s", self, set(self.get_check_events()))
    checks_start_time = time.time()
    while time.time() <= timeout_at:
        if not self.is_running():
            raise FactoryNotStarted("{} is no longer running".format(self))
        if not check_events:
            break
        check_events -= {
            (event.daemon_id, event.tag)
            for event in self.event_listener.get_events(check_events, after_time=started_at)
        }
        if check_events:
            time.sleep(1.5)
    else:
        log.error(
            "Failed to check events after %1.2f seconds for %s. Remaining events to check: %s",
            time.time() - checks_start_time,
            self,
            check_events,
        )
        return False
    log.debug("All events checked for %s: %s", self, set(self.get_check_events()))
    return True
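# Illustrative sketch, not part of the library code above: ``get_check_events()`` is
# expected to return ``(daemon-id, event-tag)`` tuples, which ``run_start_checks`` then
# waits for on the event bus. A hypothetical override for a master-like daemon might
# look like this; the tag format is an assumption made for the example only.
def _example_get_check_events(daemon_id):
    # One tuple per event that must be observed before the daemon counts as started.
    return [(daemon_id, "salt/master/{}/start".format(daemon_id))]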
def start(self, *command, max_start_attempts=None, start_timeout=None):
    """
    Start the container.
    """
    if self.is_running():
        log.warning("%s is already running.", self)
        return True
    self._terminate_result = None
    atexit.register(self.terminate)
    factory_started = False
    for callback, args, kwargs in self._before_start_callbacks:
        try:
            callback(*args, **kwargs)
        except Exception as exc:  # pragma: no cover pylint: disable=broad-except
            log.info(
                "Exception raised when running %s: %s",
                format_callback_to_string(callback, args, kwargs),
                exc,
                exc_info=True,
            )
    start_time = time.time()
    start_attempts = max_start_attempts or self.max_start_attempts
    current_attempt = 0
    while current_attempt <= start_attempts:
        current_attempt += 1
        if factory_started:
            break
        log.info("Starting %s. Attempt: %d of %d", self, current_attempt, start_attempts)
        current_start_time = time.time()
        start_running_timeout = current_start_time + (start_timeout or self.start_timeout)
        # Start the container
        self.container = self.docker_client.containers.run(
            self.image,
            name=self.name,
            detach=True,
            stdin_open=True,
            command=list(command) or None,
            **self.container_run_kwargs,
        )
        while time.time() <= start_running_timeout:
            # Don't know why, but if self.container wasn't previously in a running
            # state, and now it is, we have to re-set the self.container attribute
            # so that it gives valid status information
            self.container = self.docker_client.containers.get(self.name)
            if self.container.status != "running":
                time.sleep(0.25)
                continue

            self.container = self.docker_client.containers.get(self.name)
            logs = self.container.logs(stdout=True, stderr=True, stream=False)
            if isinstance(logs, bytes):
                stdout = logs.decode()
                stderr = None
            else:
                stdout = logs[0].decode()
                stderr = logs[1].decode()
            if stdout and stderr:
                log.info("Running Container Logs:\n%s\n%s", stdout, stderr)
            elif stdout:
                log.info("Running Container Logs:\n%s", stdout)

            # If we reached this far it means that we got the running status above, and
            # now that the container has started, run start checks
            try:
                if (
                    self.run_container_start_checks(current_start_time, start_running_timeout)
                    is False
                ):
                    time.sleep(0.5)
                    continue
            except FactoryNotStarted:
                self.terminate()
                break
            log.info(
                "The %s factory is running after %d attempts. Took %1.2f seconds",
                self,
                current_attempt,
                time.time() - start_time,
            )
            factory_started = True
            break
        else:
            # We reached start_running_timeout, re-try
            try:
                self.container.remove(force=True)
                self.container.wait()
            except APIError:
                pass
            self.container = None
    else:
        # The factory failed to confirm its running status
        self.terminate()
    if factory_started:
        for callback, args, kwargs in self._after_start_callbacks:
            try:
                callback(*args, **kwargs)
            except Exception as exc:  # pragma: no cover pylint: disable=broad-except
                log.info(
                    "Exception raised when running %s: %s",
                    format_callback_to_string(callback, args, kwargs),
                    exc,
                    exc_info=True,
                )
        # TODO: Add containers to the processes stats?!
        # if self.factories_manager and self.factories_manager.stats_processes is not None:
        #     self.factories_manager.stats_processes[self.get_display_name()] = psutil.Process(
        #         self.pid
        #     )
        return factory_started
    result = self.terminate()
    raise FactoryNotStarted(
        "The {} factory has failed to confirm running status after {} attempts, which "
        "took {:.2f} seconds ({:.2f} seconds each)".format(
            self,
            current_attempt - 1,
            time.time() - start_time,
            start_timeout or self.start_timeout,
        ),
        process_result=result,
    )
def process_logs(self):
    """
    Process the logs returned.
    """
    address = "tcp://{}:{}".format(self.log_host, self.log_port)
    context = zmq.Context()
    puller = context.socket(zmq.PULL)
    puller.set_hwm(self.socket_hwm)
    exit_timeout_seconds = 5
    exit_timeout = None
    try:
        puller.bind(address)
    except zmq.ZMQError:  # pragma: no cover
        log.exception("%s Unable to bind to puller at %s", self, address)
        return
    try:
        self.running_event.set()
        poller = zmq.Poller()
        poller.register(puller, zmq.POLLIN)
        while True:
            if not self.running_event.is_set():
                if exit_timeout is None:
                    log.debug(
                        "%s Waiting %d seconds to process any remaining log messages "
                        "before exiting...",
                        self,
                        exit_timeout_seconds,
                    )
                    exit_timeout = time.time() + exit_timeout_seconds
                if time.time() >= exit_timeout:
                    log.debug(
                        "%s Unable to process remaining log messages in time. Exiting anyway.",
                        self,
                    )
                    break
            try:
                if not poller.poll(1000):
                    continue
                msg = puller.recv()
                if msgpack.version >= (0, 5, 2):
                    record_dict = msgpack.loads(msg, raw=False)
                else:  # pragma: no cover
                    record_dict = msgpack.loads(msg, encoding="utf-8")
                if record_dict is None:
                    # A sentinel to stop processing the queue
                    log.info("%s Received the sentinel to shut down", self)
                    self.sentinel_event.set()
                    break
                try:
                    record_dict["message"]
                except KeyError:  # pragma: no cover
                    # This log record was msgpack dumped from Py2
                    for key, value in record_dict.copy().items():
                        skip_update = True
                        if isinstance(value, bytes):
                            value = value.decode("utf-8")
                            skip_update = False
                        if isinstance(key, bytes):
                            key = key.decode("utf-8")
                            skip_update = False
                        if skip_update is False:
                            record_dict[key] = value
                # Just log everything, filtering will happen on the main process
                # logging handlers
                record = logging.makeLogRecord(record_dict)
                logger = logging.getLogger(record.name)
                logger.handle(record)
            except (EOFError, KeyboardInterrupt, SystemExit):  # pragma: no cover
                break
            except Exception as exc:  # pragma: no cover pylint: disable=broad-except
                log.warning(
                    "%s An exception occurred in the processing queue thread: %s",
                    self,
                    exc,
                    exc_info=True,
                )
    finally:
        puller.close(1)
        context.term()
        log.debug("%s Process log thread terminated", self)
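# Illustrative sketch, not part of the library code above (it relies on the module's
# existing ``zmq`` and ``msgpack`` imports): ``process_logs`` expects each message on the
# PULL socket to be a msgpack-serialized ``logging.LogRecord`` dictionary. A minimal
# sending side could look like this; the host and port defaults are assumptions made for
# the example only.
def _example_push_log_record(record, host="127.0.0.1", port=51515):
    context = zmq.Context()
    pusher = context.socket(zmq.PUSH)
    pusher.connect("tcp://{}:{}".format(host, port))
    try:
        # ``record.__dict__`` is the shape ``logging.makeLogRecord`` rebuilds on the
        # receiving end; complex attributes (e.g. ``exc_info``) are not handled here.
        pusher.send(msgpack.dumps(record.__dict__, use_bin_type=True))
    finally:
        pusher.close(1)
        context.term()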