def check_exit(self):
    """Tear the server down on exit, otherwise drive the heartbeat pings.

    Three outcomes, checked in order:

    * ``self.do_exit`` is set: end the "__server__" stream, join the
      helper threads, stop the websocket and web servers, cancel async
      tasks and drop ``self.xserver``.
    * ``self.hbeat`` has not yet exceeded 4: just bump the counter.
    * otherwise: reset the counter and send one heartbeat message to
      every service whose state is at least ``STARTING``.
    """
    if self.do_exit:
        self.xspub.end("__server__")
        self.xs2server.join()
        self.heartbeat_thread.join()
        self.ws_server.stop()
        self.web_server.stop()
        cancel_async_tasks()
        del self.xserver
        return

    # Not time for a heartbeat round yet; count this call and leave.
    if self.hbeat <= 4:
        self.hbeat += 1
        return

    self.hbeat = 0
    manager = ServiceManager()
    active = [
        svc_name
        for svc_name, info in manager.list().items()
        if info['state'] >= manager.STARTING
    ]

    for svc_name in active:
        payload = json.dumps({
            '__heartbeat__': svc_name,
            'id': 'hbeat_' + str(self.hbeat_id)
        })
        manager.send(svc_name, 0, np.zeros(1), payload)
        # Each heartbeat gets its own id; ids wrap around at 9999.
        self.hbeat_id = (self.hbeat_id + 1) % 9999
def xstream2server():
    """Relay messages from the special "__server__" xstream channel.

    Other processes publish JSON messages on "__server__" so that this
    server can act on them (e.g. speedodata -> websockets). Each message
    is dispatched on its ``topic``:

    * ``speedodata``: broadcast to all websocket clients.
    * ``callback`` (with a ``callback_id``): sent to that one client.
    * ``xs_throughput``: the ``message`` is itself JSON mapping
      ``"<service>.<edge>"`` names to throughput values, which are
      forwarded to ``ServiceManager().update_throughput_stats``.

    The loop ends when ``get_msg()`` returns ``None``; async tasks are
    cancelled on the way out. A failure while handling one message is
    logged and does not stop the loop.
    """
    logger = logging.getLogger(__name__)
    xs = xstream.Subscribe("__server__")

    while True:
        msg_str = xs.get_msg()
        if msg_str is None:
            break

        try:
            msg = json.loads(msg_str)
            topic = msg['topic']
            if topic == 'speedodata':
                WebSocketHandler.broadcast(topic, msg['message'])
            elif topic == 'callback' and 'callback_id' in msg:
                WebSocketHandler.send_to_client(
                    msg['callback_id'], topic, msg['message'])
            elif topic == 'xs_throughput':
                report = json.loads(msg['message'])
                for name, throughput in report.items():
                    service_name, sep, edge_name = name.partition('.')
                    if not sep:
                        # No '.' in the name: keep the historical result,
                        # where both parts are the full name.
                        edge_name = name
                    ServiceManager().update_throughput_stats(
                        service_name, edge_name, throughput)
        except Exception:
            # Stay best-effort (one bad message must not kill the relay),
            # but leave a trace instead of swallowing the error. Unlike a
            # bare ``except``, this lets SystemExit/KeyboardInterrupt
            # propagate.
            logger.exception("Failed to handle __server__ message")

    cancel_async_tasks()
def heartbeat(stop):
    """Track service heartbeats and restart services that go silent.

    Subscribes to the "__heartbeat__" xstream channel and keeps a small
    per-service cache (``node_status``) recording when each channel of a
    service last reported. A heartbeat arriving on a service's last edge
    marks the service valid; the first such heartbeat while the service
    is ``STARTING`` promotes it to ``STARTED``. After every message, and
    after every receive timeout, the cache is checked against the
    service manager's state (see ``check_services``).

    Args:
        stop: zero-argument callable; a truthy return value ends the loop.
    """
    logger = logging.getLogger(__name__)
    # NOTE(review): timeout is presumably in milliseconds - confirm
    # against xstream.Subscribe.
    xs = xstream.Subscribe("__heartbeat__", timeout=5000)
    service_manager = ServiceManager()
    node_status = {}

    def check_services(node_status, now):
        """Drop stale cache entries and restart services that went silent."""
        # ``stop`` is a callable and must be called: testing the function
        # object itself is always truthy, which would skip every check.
        if stop():
            return

        invalid_services = []
        for service, status in node_status.items():
            last_valid = status['last_valid']
            service_state = service_manager._services[service]['state']
            is_starting = service_state == service_manager.STARTING
            is_started = service_state == service_manager.STARTED

            # if the service has been stopped, clear it
            if service_state == service_manager.STOPPED:
                invalid_services.append(service)
            # if there's a discrepancy in what the service_manager says
            # and what we have cached, clear it
            elif is_starting and status['is_started']:
                invalid_services.append(service)
            # if it's started and hasn't been valid in the last n secs,
            # restart it
            elif is_started and now - last_valid > 5:
                logger.warning("Service %s is dead, restarting", service)
                service_manager.stop(service)
                service_manager.start(service)
                status['is_started'] = False

        # Deleted after the loop so the dict is not mutated while iterating.
        for service in invalid_services:
            del node_status[service]

    while not stop():
        # when enabling coverage, this line will raise an exception for some
        # reason. For now, just catching it
        try:
            msg_str = xs.get_msg()
        except Exception:
            logger.exception("Shouldn't happen")
            # Nothing was received: skip this round rather than fall
            # through with an unbound or stale ``msg_str``.
            continue
        now = time.time()

        # the get_msg timed out, i.e. no heartbeats received
        if msg_str == (None, None):
            check_services(node_status, now)
            continue

        msg = json.loads(msg_str)
        service = msg['service']
        channel = msg['channel']

        # if this is the first time we've seen this service
        if service not in node_status:
            _first_edge, last_edge = service_manager._get_graph_io(service)
            node_status[service] = {
                'last_valid': 0,  # saves the last time this service was valid
                'is_started': False,  # our check that services haven't stopped
                'last_edge': last_edge[0],  # saves the last edge of the service
                'channels': {}  # save heartbeat times for each channel
            }

        status = node_status[service]
        status['channels'][channel] = now
        service_state = service_manager._services[service]['state']

        if status['last_edge'] == channel:
            if service_state == service_manager.STARTING:
                if not status['is_started']:
                    service_manager._services[service][
                        'state'] = service_manager.STARTED
                    status['is_started'] = True
                else:
                    # there's a discrepancy. For example, the service may
                    # have been stopped and something else started with
                    # the same name. In this case, clear the cache
                    del node_status[service]
                    continue
            status['last_valid'] = now

        check_services(node_status, now)

    cancel_async_tasks()