def services_health_litmus_test():
    """Summarise the health of all registered services as a colour.

    Returns:
        'red' if at least one registered service has the status name
        FAILED, otherwise 'amber' if at least one has the status name
        UNRESPONSIVE, otherwise 'green'.
    """
    # Convert every record up front so that an invalid status value raises
    # regardless of which colour would eventually be returned.
    status_names = {
        ServiceRecord.Status(int(record._status)).name.upper()
        for record in ServiceRegistry.all()
    }

    if 'FAILED' in status_names:
        return 'red'
    if 'UNRESPONSIVE' in status_names:
        return 'amber'
    return 'green'
def get_service_records():
    """Describe every registered service as a plain dictionary.

    Returns:
        A dict with a single key, 'services', holding one dict per record
        returned by ``ServiceRegistry.all()``. Each entry carries the
        record's name, type, address, management port, service port,
        protocol and its status name in lower case.
    """
    services = [
        {
            'name': record._name,
            'type': record._type,
            'address': record._address,
            'management_port': record._management_port,
            'service_port': record._port,
            'protocol': record._protocol,
            'status': ServiceRecord.Status(int(record._status)).name.lower()
        }
        for record in ServiceRegistry.all()
    ]
    return {'services': services}
async def _monitor_loop(self):
    """Ping every registered service in an endless loop and track its state.

    Each round walks ``ServiceRegistry.all()``:

    * records whose status is not Running, Unresponsive or Failed are
      skipped;
    * Failed records are never pinged; when ``self._restart_failed`` is
      "auto" and the record id is not yet in ``self.restarted_services``,
      the id is remembered and ``self.restart_service`` is scheduled;
    * all other records are pinged on their management port. A good reply
      (JSON whose "uptime" is not None) sets the status to Running, any
      error sets it to Unresponsive and bumps the record's counter;
    * once the counter exceeds ``self._max_attempts`` the record is marked
      as failed and an 'SRVFL' audit entry is attempted (best effort).

    The coroutine then awaits ``self._sleep(self._sleep_interval)`` and
    starts the next round; it does not return.
    """
    round_no = 0
    # Per-service counter keyed by record id: 1 while the service answers,
    # incremented on every failed ping, reset to 0 once marked as failed.
    attempts = {}

    while True:
        round_no += 1
        round_banner = (
            "Starting next round#{} of service monitoring, sleep/i:{} ping/t:{} max/a:{}"
            .format(round_no, self._sleep_interval, self._ping_timeout, self._max_attempts))
        self._logger.debug(round_banner)

        for service_record in ServiceRegistry.all():
            svc_id = service_record._id
            attempts.setdefault(svc_id, 1)

            # Only running, doubtful (may still recover) or failed (may need
            # a restart) services are of interest to the monitor.
            if service_record._status not in (
                    ServiceRecord.Status.Running,
                    ServiceRecord.Status.Unresponsive,
                    ServiceRecord.Status.Failed):
                continue

            self._logger.debug("Service: {} Status: {}".format(
                service_record._name, service_record._status))

            if service_record._status == ServiceRecord.Status.Failed:
                # Failed services are not pinged; at most one restart is
                # scheduled per service id.
                if (self._restart_failed == "auto"
                        and service_record._id not in self.restarted_services):
                    self.restarted_services.append(service_record._id)
                    asyncio.ensure_future(self.restart_service(service_record))
                continue

            try:
                ping_url = "{}://{}:{}/fledge/service/ping".format(
                    service_record._protocol,
                    service_record._address,
                    service_record._management_port)
                async with aiohttp.ClientSession() as session:
                    async with session.get(ping_url, timeout=self._ping_timeout) as resp:
                        reply = json.loads(await resp.text())
                        if reply["uptime"] is None:
                            raise ValueError('res.uptime is None')
            except Exception as ex:
                # Every failure is handled the same way; only the log text
                # depends on the kind of error (checked in this order).
                if isinstance(ex, (asyncio.TimeoutError,
                                   aiohttp.client_exceptions.ServerTimeoutError)):
                    log_template = "ServerTimeoutError: %s, %s"
                elif isinstance(ex, aiohttp.client_exceptions.ClientConnectorError):
                    log_template = "ClientConnectorError: %s, %s"
                elif isinstance(ex, ValueError):
                    log_template = "Invalid response: %s, %s"
                else:
                    log_template = "Exception occurred: %s, %s"
                service_record._status = ServiceRecord.Status.Unresponsive
                attempts[svc_id] += 1
                self._logger.info(log_template, str(ex), repr(service_record))
            else:
                service_record._status = ServiceRecord.Status.Running
                attempts[svc_id] = 1

            if attempts[svc_id] > self._max_attempts:
                ServiceRegistry.mark_as_failed(service_record._id)
                attempts[svc_id] = 0
                try:
                    audit = AuditLogger(connect.get_storage_async())
                    await audit.failure('SRVFL', {'name': service_record._name})
                except Exception as ex:
                    # Auditing is best effort and must not stop monitoring.
                    self._logger.info("Failed to audit service failure %s", str(ex))

        await self._sleep(self._sleep_interval)