Exemple #1
0
 def services_health_litmus_test():
     """Summarise the health of all registered services as a traffic-light colour.

     Every record returned by ``ServiceRegistry.all()`` has its ``_status``
     converted to the upper-cased name of the matching ``ServiceRecord.Status``
     member before the overall verdict is chosen.

     Returns:
         'red' when at least one service is FAILED, otherwise 'amber' when at
         least one service is UNRESPONSIVE, otherwise 'green'.
     """
     status_names = []
     for record in ServiceRegistry.all():
         status = ServiceRecord.Status(int(record._status))
         status_names.append(status.name.upper())

     # A failed service outranks an unresponsive one.
     if 'FAILED' in status_names:
         return 'red'
     if 'UNRESPONSIVE' in status_names:
         return 'amber'
     return 'green'
Exemple #2
0
def get_service_records():
    """Return every registered service as a plain, serialisable structure.

    Returns:
        A dict with a single key ``'services'`` holding one dict per record
        returned by ``ServiceRegistry.all()``. Each entry carries the record's
        name, type, address, management port, service port, protocol and the
        lower-cased name of its ``ServiceRecord.Status`` member.
    """
    def _describe(record):
        # Status is stored as a value convertible to int; expose its enum name.
        status = ServiceRecord.Status(int(record._status))
        return {
            'name': record._name,
            'type': record._type,
            'address': record._address,
            'management_port': record._management_port,
            'service_port': record._port,
            'protocol': record._protocol,
            'status': status.name.lower(),
        }

    return {'services': [_describe(record) for record in ServiceRegistry.all()]}
Exemple #3
0
    async def _monitor_loop(self):
        """Async monitor loop that pings registered services and tracks their status.

        Runs until the task is cancelled. On every round, for each record
        returned by ``ServiceRegistry.all()``:

        * records whose status is not Running, Unresponsive or Failed are
          skipped;
        * Failed records are never pinged; when ``self._restart_failed`` is
          "auto" a restart is scheduled once per service id (ids already
          handled are remembered in ``self.restarted_services``);
        * Running/Unresponsive records are pinged at ``/fledge/service/ping``
          on their management port. A reply whose ``uptime`` is not None sets
          the status to Running; any failure sets it to Unresponsive and
          increments that service's counter;
        * once the counter exceeds ``self._max_attempts`` the service is
          passed to ``ServiceRegistry.mark_as_failed`` and an 'SRVFL' audit
          failure entry is attempted (audit errors are only logged).

        Between rounds the coroutine awaits ``self._sleep(self._sleep_interval)``.

        Raises:
            asyncio.CancelledError: always propagated, so that cancelling the
                task stops the loop instead of being mistaken for a ping error.
        """
        # check health of all micro-services every N seconds
        round_cnt = 0
        # Per-service-id count of rounds spent in the current status.
        # For a service that answers pings the count is always 1.
        # For a service that does not, the count shows for how many rounds
        # it has been failing to respond.
        check_count = {}
        monitored_statuses = (
            ServiceRecord.Status.Running,
            ServiceRecord.Status.Unresponsive,
            ServiceRecord.Status.Failed,
        )
        while True:
            round_cnt += 1
            self._logger.debug(
                "Starting next round#{} of service monitoring, sleep/i:{} ping/t:{} max/a:{}"
                .format(round_cnt, self._sleep_interval, self._ping_timeout,
                        self._max_attempts))
            for service_record in ServiceRegistry.all():
                check_count.setdefault(service_record._id, 1)

                # Only running, unresponsive (given a chance to recover) and
                # failed (restart candidates) services are of interest here.
                if service_record._status not in monitored_statuses:
                    continue

                self._logger.debug("Service: {} Status: {}".format(
                    service_record._name, service_record._status))

                if service_record._status == ServiceRecord.Status.Failed:
                    # Failed services are not pinged; at most one automatic
                    # restart is scheduled per service id.
                    if (self._restart_failed == "auto"
                            and service_record._id not in self.restarted_services):
                        self.restarted_services.append(service_record._id)
                        asyncio.ensure_future(
                            self.restart_service(service_record))
                    continue

                # (log format, error text) when the ping failed, None on success.
                ping_failure = None
                try:
                    url = "{}://{}:{}/fledge/service/ping".format(
                        service_record._protocol, service_record._address,
                        service_record._management_port)
                    async with aiohttp.ClientSession() as session:
                        async with session.get(
                                url, timeout=self._ping_timeout) as resp:
                            text = await resp.text()
                            res = json.loads(text)
                            if res["uptime"] is None:
                                raise ValueError('res.uptime is None')
                except asyncio.CancelledError:
                    # Before Python 3.8 CancelledError derives from Exception;
                    # without this clause the catch-all below would swallow
                    # the cancellation and flag the service as unresponsive.
                    raise
                except (asyncio.TimeoutError,
                        aiohttp.client_exceptions.ServerTimeoutError) as ex:
                    ping_failure = ("ServerTimeoutError: %s, %s", str(ex))
                except aiohttp.client_exceptions.ClientConnectorError as ex:
                    ping_failure = ("ClientConnectorError: %s, %s", str(ex))
                except ValueError as ex:
                    # Covers both undecodable JSON and a None uptime.
                    ping_failure = ("Invalid response: %s, %s", str(ex))
                except Exception as ex:
                    # Deliberately broad: one misbehaving service must not
                    # bring down the monitor for all the others.
                    ping_failure = ("Exception occurred: %s, %s", str(ex))

                if ping_failure is None:
                    service_record._status = ServiceRecord.Status.Running
                    check_count[service_record._id] = 1
                else:
                    service_record._status = ServiceRecord.Status.Unresponsive
                    check_count[service_record._id] += 1
                    log_format, error_text = ping_failure
                    self._logger.info(log_format, error_text,
                                      repr(service_record))

                if check_count[service_record._id] > self._max_attempts:
                    ServiceRegistry.mark_as_failed(service_record._id)
                    check_count[service_record._id] = 0
                    try:
                        audit = AuditLogger(connect.get_storage_async())
                        await audit.failure('SRVFL',
                                            {'name': service_record._name})
                    except Exception as ex:
                        # Auditing is best effort; never let it stop monitoring.
                        self._logger.info("Failed to audit service failure %s",
                                          str(ex))
            await self._sleep(self._sleep_interval)