async def _monitor_loop(self):
    """Monitor registered services forever, one health-check round at a time.

    Each round walks ``ServiceRegistry.all()``:

    * records whose status is not Running, Unresponsive or Failed are
      skipped;
    * a Failed record is never pinged; when ``self._restart_failed`` is
      ``"auto"`` a restart is scheduled for it once (its id is remembered
      in ``self.restarted_services``);
    * every other record is pinged at ``/foglamp/service/ping`` with
      ``self._ping_timeout``. A successful ping sets the status to Running
      and resets the record's counter to 1; any failure sets the status to
      Unresponsive and increments the counter;
    * once the counter exceeds ``self._max_attempts`` the record is marked
      as failed in the registry and an ``SRVFL`` audit entry is attempted.

    After the round the coroutine waits ``self._sleep_interval`` via
    ``self._sleep`` and starts over. The loop has no exit of its own; it
    ends only when the task is cancelled or an exception escapes.

    Raises:
        asyncio.CancelledError: propagated when the task is cancelled.
    """
    round_cnt = 0
    # Per-service counter keyed by service id: stays at 1 while pings
    # succeed, grows by one on every failed ping, and is reset to 0 once the
    # service has been marked as failed.
    check_count = {}
    # Statuses this loop acts on; anything else is ignored for the round.
    monitored_statuses = (
        ServiceRecord.Status.Running,
        ServiceRecord.Status.Unresponsive,
        ServiceRecord.Status.Failed,
    )
    while True:
        round_cnt += 1
        self._logger.debug(
            "Starting next round#{} of service monitoring, sleep/i:{} ping/t:{} max/a:{}"
            .format(round_cnt, self._sleep_interval, self._ping_timeout,
                    self._max_attempts))
        for service_record in ServiceRegistry.all():
            service_id = service_record._id
            check_count.setdefault(service_id, 1)

            if service_record._status not in monitored_statuses:
                continue

            self._logger.debug("Service: {} Status: {}".format(
                service_record._name, service_record._status))

            if service_record._status == ServiceRecord.Status.Failed:
                # Schedule at most one restart per service id; a failed
                # service is not pinged.
                if (self._restart_failed == "auto"
                        and service_id not in self.restarted_services):
                    self.restarted_services.append(service_id)
                    asyncio.ensure_future(
                        self.restart_service(service_record))
                continue

            # (log format, exception text) of a failed ping; None on success.
            failure = None
            try:
                url = "{}://{}:{}/foglamp/service/ping".format(
                    service_record._protocol, service_record._address,
                    service_record._management_port)
                async with aiohttp.ClientSession() as session:
                    async with session.get(
                            url, timeout=self._ping_timeout) as resp:
                        text = await resp.text()
                        res = json.loads(text)
                        if res["uptime"] is None:
                            raise ValueError('res.uptime is None')
            except asyncio.CancelledError:
                # Before Python 3.8 CancelledError derives from Exception;
                # re-raise so cancelling the monitor task is not mistaken
                # for a failed ping by the catch-all handler below.
                raise
            except (asyncio.TimeoutError,
                    aiohttp.client_exceptions.ServerTimeoutError) as ex:
                failure = ("ServerTimeoutError: %s, %s", str(ex))
            except aiohttp.client_exceptions.ClientConnectorError as ex:
                failure = ("ClientConnectorError: %s, %s", str(ex))
            except ValueError as ex:
                # Covers both undecodable JSON and a null "uptime".
                failure = ("Invalid response: %s, %s", str(ex))
            except Exception as ex:
                # Deliberately broad: any other ping problem only makes the
                # service doubtful, it must not stop the monitor.
                failure = ("Exception occurred: %s, %s", str(ex))

            if failure is None:
                service_record._status = ServiceRecord.Status.Running
                check_count[service_id] = 1
            else:
                log_format, detail = failure
                service_record._status = ServiceRecord.Status.Unresponsive
                check_count[service_id] += 1
                self._logger.info(log_format, detail, repr(service_record))

            if check_count[service_id] > self._max_attempts:
                ServiceRegistry.mark_as_failed(service_id)
                check_count[service_id] = 0
                try:
                    audit = AuditLogger(connect.get_storage_async())
                    await audit.failure('SRVFL',
                                        {'name': service_record._name})
                except asyncio.CancelledError:
                    # Same reasoning as above: never swallow cancellation.
                    raise
                except Exception as ex:
                    # Auditing is best effort; the service is already marked.
                    self._logger.info(
                        "Failed to audit service failure %s", str(ex))
        await self._sleep(self._sleep_interval)
async def test_get_health(self, mocker, client):
    """GET /foglamp/service lists every registered service with its status.

    The endpoint is queried once against an empty registry and once after
    four services were registered, one of them then unregistered and another
    marked as failed.
    """
    endpoint = '/foglamp/service'

    # Empty registry: the endpoint answers with an empty service list.
    empty_resp = await client.get(endpoint)
    assert 200 == empty_resp.status
    assert {'services': []} == json.loads(await empty_resp.text())

    mocker.patch.object(InterestRegistry, "__init__", return_value=None)
    mocker.patch.object(InterestRegistry, "get", return_value=[])

    # (name, type, address, port, protocol, status expected in the response);
    # the same number is used for both the service and the management port.
    fixtures = (
        ('name1', 'Storage', 'address1', 1, 'protocol1', 'running'),
        ('name2', 'Southbound', 'address2', 2, 'protocol2', 'running'),
        ('name3', 'Southbound', 'address3', 3, 'protocol3', 'shutdown'),
        ('name4', 'Southbound', 'address4', 4, 'protocol4', 'failed'),
    )

    with patch.object(ServiceRegistry._logger, 'info') as log_patch_info:
        # Populate the registry, remembering the id returned for each name.
        ids_by_name = {}
        for name, s_type, address, port, protocol, _ in fixtures:
            ids_by_name[name] = ServiceRegistry.register(
                name, s_type, address, port, port, protocol)
        ServiceRegistry.unregister(ids_by_name['name3'])
        ServiceRegistry.mark_as_failed(ids_by_name['name4'])

        populated_resp = await client.get(endpoint)
        assert 200 == populated_resp.status
        actual = json.loads(await populated_resp.text())

        expected_services = [
            {
                'type': s_type,
                'service_port': port,
                'address': address,
                'protocol': protocol,
                'status': status,
                'name': name,
                'management_port': port,
            }
            for name, s_type, address, port, protocol, status in fixtures
        ]
        assert actual == {'services': expected_services}
    # Presumably one info log per registry change (4 register + 1 unregister
    # + 1 mark_as_failed) - confirm against ServiceRegistry.
    assert 6 == log_patch_info.call_count
async def _monitor_loop(self):
    """Monitor registered services forever, one health-check round at a time.

    Each round walks ``ServiceRegistry.all()`` and pings every record whose
    status is Running or Unresponsive at ``/foglamp/service/ping`` with
    ``self._ping_timeout``; records in any other status are skipped.

    * A successful ping sets the status to Running and resets the record's
      counter to 1.
    * A failed ping sets the status to Unresponsive and increments the
      counter.
    * Once the counter exceeds ``self._max_attempts`` the record is marked
      as failed in the registry and an ``SRVFL`` audit entry is attempted.

    After the round the coroutine waits ``self._sleep_interval`` via
    ``self._sleep`` and starts over. The loop has no exit of its own; it
    ends only when the task is cancelled or an exception escapes.

    Raises:
        asyncio.CancelledError: propagated when the task is cancelled.
    """
    # Imported locally because this block is the only part of the visible
    # code that needs the name.
    import asyncio

    round_cnt = 0
    # Per-service counter keyed by service id: stays at 1 while pings
    # succeed, grows by one on every failed ping, and is reset to 0 once the
    # service has been marked as failed.
    check_count = {}
    # Only services in these statuses are pinged (an unresponsive one gets a
    # chance to recover).
    pingable_statuses = (
        ServiceRecord.Status.Running,
        ServiceRecord.Status.Unresponsive,
    )
    while True:
        round_cnt += 1
        self._logger.debug(
            "Starting next round#{} of service monitoring, sleep/i:{} ping/t:{} max/a:{}"
            .format(round_cnt, self._sleep_interval, self._ping_timeout,
                    self._max_attempts))
        for service_record in ServiceRegistry.all():
            service_id = service_record._id
            check_count.setdefault(service_id, 1)

            if service_record._status not in pingable_statuses:
                continue

            try:
                url = "{}://{}:{}/foglamp/service/ping".format(
                    service_record._protocol, service_record._address,
                    service_record._management_port)
                async with aiohttp.ClientSession() as session:
                    async with session.get(
                            url, timeout=self._ping_timeout) as resp:
                        text = await resp.text()
                        res = json.loads(text)
                        if res["uptime"] is None:
                            raise ValueError('Improper Response')
            except asyncio.CancelledError:
                # Before Python 3.8 CancelledError derives from Exception;
                # re-raise so cancelling the monitor task is not mistaken
                # for a failed ping by the catch-all handler below.
                raise
            except ValueError:
                # Covers both undecodable JSON and a null "uptime".
                service_record._status = ServiceRecord.Status.Unresponsive
                check_count[service_id] += 1
                self._logger.info("Marked as doubtful micro-service %s",
                                  repr(service_record))
            except Exception as ex:  # TODO: Fix too broad exception clause
                # Some exceptions (a bare asyncio.TimeoutError, for example)
                # carry no message, which used to leave the cause unlogged.
                # Fall back to the exception class name in that case.
                detail = str(ex)
                if not detail.strip():
                    detail = type(ex).__name__
                self._logger.info(
                    "Exception occurred during monitoring: %s", detail)
                service_record._status = ServiceRecord.Status.Unresponsive
                check_count[service_id] += 1
                self._logger.info(
                    "Marked as unresponsive micro-service %s",
                    repr(service_record))
            else:
                service_record._status = ServiceRecord.Status.Running
                check_count[service_id] = 1

            if check_count[service_id] > self._max_attempts:
                ServiceRegistry.mark_as_failed(service_id)
                check_count[service_id] = 0
                try:
                    audit = AuditLogger(connect.get_storage())
                    await audit.failure('SRVFL',
                                        {'name': service_record._name})
                except asyncio.CancelledError:
                    # Same reasoning as above: never swallow cancellation.
                    raise
                except Exception as ex:
                    # Auditing is best effort; the service is already marked.
                    self._logger.info(
                        "Failed to audit service failure %s", str(ex))
        await self._sleep(self._sleep_interval)