Ejemplo n.º 1
0
    def _workers_monitoring(self):
        """
        Poll the status of every worker until the pool stops being active.

        On each pass ``_query_worker_status`` yields a ``(status, reason)``
        pair per worker and the worker is bucketed under that status. A
        worker reported as inactive is passed to ``_handle_inactive`` while
        holding the pool lock; a truthy result re-labels it as active for
        this pass. Changes in the bucketed picture are logged.

        The monitor ends when:

        * the pool is no longer active;
        * an inactive worker is found while the pool is inactive or its
          status tag is STOPPING/STOPPED;
        * only inactive workers remain (the pool is aborted first);
        * the wait between two passes raises ``RuntimeError``.
        """
        last_reported = {"active": [], "inactive": [], "initializing": []}
        heartbeat = self.cfg.worker_heartbeat
        poll_period = heartbeat if heartbeat else 5  # seconds

        while self.active:
            current = {"active": [], "inactive": [], "initializing": []}

            for worker in self._workers:
                state, why = self._query_worker_status(worker)
                if state == "inactive":
                    with self._pool_lock:
                        stopping = not self.active or self.status.tag in (
                            self.status.STOPPING,
                            self.status.STOPPED,
                        )
                        if stopping:
                            # Pool is aborting/stopping. Nothing runs after
                            # the outer loop, so returning ends the monitor.
                            return
                        if self._handle_inactive(worker, why):
                            state = "active"

                current[state].append(worker)

            # Only report when the picture differs from the last report.
            if current != last_reported:
                self.logger.info(
                    "%s Hosts status update", datetime.datetime.now()
                )
                self.logger.info(pprint.pformat(current))
                last_reported = current

            nobody_usable = not (current["active"] or current["initializing"])
            if nobody_usable and current["inactive"]:
                self.logger.critical(
                    "All workers of {} are inactive.".format(self)
                )
                self.abort()
                return

            try:
                # Pause between passes; a RuntimeError raised by the wait
                # finishes the monitoring thread early.
                wait_until_predicate(
                    lambda: not self.is_alive,
                    timeout=poll_period,
                    interval=0.05,
                )
            except RuntimeError:
                return
Ejemplo n.º 2
0
    def _workers_monitoring(self):
        """
        Monitor the health of workers in a loop. Executes in a separate thread.

        The loop runs while the loop handler is alive. On each pass, under
        the pool lock, every worker is inspected:

        * a worker with a truthy ``handler`` attribute is first passed to
          ``_workers_handler_monitoring``;
        * a worker that is not ``active`` is counted as inactive;
        * a worker without any heartbeat yet (``last_heartbeat is None``) is
          left alone while the monitor has been running for at most
          ``cfg.heartbeat_init_window`` seconds, and is handed to
          ``_deco_worker`` with a "could not initialize" message afterwards;
        * a worker whose last heartbeat is older than
          ``worker_heartbeat * heartbeats_miss_limit`` seconds is counted as
          inactive and handed to ``_deco_worker`` with a "failed to send
          heartbeats" message.

        When there is at least one worker and all of them are inactive, the
        pool is aborted and monitoring stops.

        :raises RuntimeError: if no worker heartbeat is configured.
        """
        if not self.cfg.worker_heartbeat:
            raise RuntimeError(
                'Cannot monitor workers with no heartbeat configured.')

        monitor_started = time.time()
        # Doubles as the heartbeat staleness threshold and the pause
        # between two monitoring passes.
        loop_sleep = self.cfg.worker_heartbeat * self.cfg.heartbeats_miss_limit

        while self._loop_handler.is_alive():
            # Only these two sets feed the all-inactive decision below.
            w_total = set()
            w_inactive = set()

            monitor_alive = time.time() - monitor_started
            init_window = monitor_alive <= self.cfg.heartbeat_init_window
            with self._pool_lock:
                for worker in self._workers:
                    if getattr(worker, 'handler', None):
                        self._workers_handler_monitoring(worker)
                    w_total.add(worker)
                    if not worker.active:
                        w_inactive.add(worker)
                    elif worker.last_heartbeat is None:
                        # No heartbeat received yet: tolerated only during
                        # the initialization window.
                        if not init_window:
                            self._deco_worker(
                                worker, 'Aborting {}, could not initialize.')
                    elif time.time() - worker.last_heartbeat > loop_sleep:
                        w_inactive.add(worker)
                        self._deco_worker(
                            worker, 'Aborting {}, failed to send heartbeats.')

                # An empty pool is not treated as "all workers inactive".
                if w_total and len(w_inactive) == len(w_total):
                    self.logger.critical(
                        'All workers of {} are inactive.'.format(self))
                    self.abort()
                    break

            try:
                # For early finish of worker monitoring thread.
                wait_until_predicate(lambda: not self._loop_handler.is_alive(),
                                     timeout=loop_sleep, interval=0.05)
            except RuntimeError:
                break
Ejemplo n.º 3
0
    def _workers_monitoring(self):
        """
        Monitor worker heartbeats in a loop while the loop handler is alive.

        Returns immediately when ``cfg.worker_heartbeat`` is not set, since
        no heartbeat means no fault tolerance for workers.

        On each pass, under the pool lock, every worker is inspected:

        * a worker that is not ``active`` is counted as inactive;
        * a worker without any heartbeat yet (``last_heartbeat is None``) is
          left alone while the monitor has been running for at most
          ``cfg.heartbeat_init_window`` seconds, and is handed to
          ``_deco_worker`` with a "could not initialize" message afterwards;
        * a worker whose last heartbeat is older than
          ``worker_heartbeat * heartbeats_miss_limit`` seconds is counted as
          inactive and handed to ``_deco_worker`` with a "failed to send
          heartbeats" message.

        When there is at least one worker and all inspected workers are
        inactive, the pool is aborted and monitoring stops. The check counts
        the workers actually inspected rather than ``cfg.size``, so it still
        triggers when the worker list and the configured size differ.
        """
        if not self.cfg.worker_heartbeat:
            # No heartbeat means no fault tolerance for worker.
            return

        monitor_started = time.time()
        # Doubles as the heartbeat staleness threshold and the pause
        # between two monitoring passes.
        loop_sleep = self.cfg.worker_heartbeat * self.cfg.heartbeats_miss_limit

        while self._loop_handler.is_alive():
            # Only these two sets feed the all-inactive decision below.
            w_total = set()
            w_inactive = set()

            monitor_alive = time.time() - monitor_started
            init_window = monitor_alive <= self.cfg.heartbeat_init_window
            with self._pool_lock:
                for worker in self._workers:
                    w_total.add(worker)
                    if not worker.active:
                        w_inactive.add(worker)
                    elif worker.last_heartbeat is None:
                        # No heartbeat received yet: tolerated only during
                        # the initialization window.
                        if not init_window:
                            self._deco_worker(
                                worker, 'Aborting {}, could not initialize.')
                    elif time.time() - worker.last_heartbeat > loop_sleep:
                        w_inactive.add(worker)
                        self._deco_worker(
                            worker, 'Aborting {}, failed to send heartbeats.')

                # Compare against the workers inspected in this pass; an
                # empty pool is not treated as "all workers inactive".
                if w_total and len(w_inactive) == len(w_total):
                    self.logger.critical(
                        'All workers of {} are inactive.'.format(self))
                    self.abort()
                    break
            try:
                # For early finish of worker monitoring thread.
                wait_until_predicate(lambda: not self._loop_handler.is_alive(),
                                     timeout=loop_sleep,
                                     interval=0.05)
            except RuntimeError:
                return
Ejemplo n.º 4
0
    def _workers_monitoring(self):
        """
        Poll worker status in a loop while the pool is alive and active.

        On each pass, under the pool lock, every worker is bucketed under
        the status returned by ``_query_worker_status``. Changes in the
        bucketed picture are logged. If only inactive workers remain, the
        pool is aborted and monitoring ends. Monitoring also ends when the
        wait between two passes raises ``RuntimeError``.

        The pause between passes is ``cfg.worker_heartbeat`` when set,
        otherwise 5 seconds.
        """
        last_reported = {'active': [], 'inactive': [], 'initializing': []}
        poll_period = self.cfg.worker_heartbeat or 5  # seconds

        while self.is_alive and self.active:
            current = {'active': [], 'inactive': [], 'initializing': []}

            with self._pool_lock:
                for worker in self._workers:
                    current[self._query_worker_status(worker)].append(worker)

                # Only report when the picture differs from the last report.
                if current != last_reported:
                    self.logger.info('%s Hosts status update',
                                     datetime.datetime.now())
                    self.logger.info(pprint.pformat(current))
                    last_reported = current

                nobody_usable = not (
                    current['active'] or current['initializing'])
                if nobody_usable and current['inactive']:
                    self.logger.critical(
                        'All workers of {} are inactive.'.format(self))
                    self.abort()
                    # Nothing runs after the loop, so returning is enough.
                    return

            try:
                # Pause between passes; a RuntimeError raised by the wait
                # finishes the monitoring thread early.
                wait_until_predicate(lambda: not self.is_alive,
                                     timeout=poll_period,
                                     interval=0.05)
            except RuntimeError:
                return