def monitor_node_latencies():
    """Continuously ping the other nodes and publish the measured latency.

    The id of this node is read from the ``ID`` environment variable. If the
    configuration contains no node other than this one with a hostname
    different from ``"localhost"``, a message is logged and the function
    returns. Otherwise it loops forever: the node list is re-read on every
    round, each remote node is pinged through ``./metrics/ping.sh`` and the
    script output, parsed as a float, is set on the ``host_latency`` metric.

    A failure while pinging one node is logged and does not stop the loop.
    """
    own_id = int(os.getenv("ID"))

    def remote_peers():
        """Return (all configured nodes, the non-local nodes other than us)."""
        all_nodes = get_nodes()
        peers = {
            node_id: node
            for node_id, node in all_nodes.items()
            if node_id != own_id and node.hostname != "localhost"
        }
        return all_nodes, peers

    _, peers = remote_peers()
    if not peers:
        logger.info("No use to ping when running locally, aborting")
        return

    while True:
        # re-read the configuration each round so the peer set stays current
        nodes, peers = remote_peers()
        for peer_id, peer in peers.items():
            try:
                cmd = f"sudo sh ./metrics/ping.sh {peer.hostname}".split(" ")
                stdout, _ = subprocess.Popen(
                    cmd, stdout=subprocess.PIPE).communicate()
                # the script output is a single number followed by a newline
                latency = float(stdout.decode().replace("\n", ""))
                host_latency.labels(own_id, nodes[own_id].hostname, peer_id,
                                    peer.hostname).set(latency)
            except Exception as e:
                logger.error(f"Got error {e} when pinging {peer.hostname}")
def setup_communication(resolver):
    """Create the receiver and sender channels and run the asyncio loop.

    A ``Receiver`` for this node is started in its own thread, one ``Sender``
    is created for every other configured node, and both are injected into
    ``resolver``. Each sender's ``start()`` is then scheduled on the asyncio
    event loop, the resolver is marked ``SystemStatus.READY`` and the loop is
    run forever, so this function only returns once the loop has been
    stopped.

    NOTE(review): ``id`` is not defined in this block; it is expected to be a
    module-level name holding this node's id -- confirm against the file.

    Args:
        resolver: object that receives the ``senders``, ``receiver`` and
            ``system_status`` attributes and provides ``on_message_sent``.
    """
    nodes = config.get_nodes()
    own_node = nodes[id]

    # inbound channel for messages from the other nodes, served by a thread
    receiver = Receiver(id, own_node.ip, own_node.port, resolver,
                        resolver.on_message_sent)
    Thread(target=receiver.start).start()

    # one outbound channel per remote node, keyed by that node's id
    senders = {
        node.id: Sender(id, node, resolver.on_message_sent)
        for node in nodes.values()
        if id != node.id
    }
    logger.info("All senders connected")

    resolver.senders = senders
    resolver.receiver = receiver

    loop = asyncio.get_event_loop()
    for sender in senders.values():
        loop.create_task(sender.start())
    resolver.system_status = SystemStatus.READY

    loop.run_forever()
    loop.close()
def setup_fd_communication(resolver):
    """Create the self-stabilizing channels used by the failure detectors.

    An ``FDReceiver`` is bound to ``0.0.0.0`` on port ``7000 + id`` and
    listens in its own thread. For every other configured node an
    ``FDSender`` targeting ``(hostname, 7000 + node.id)`` is created and
    started in its own thread. Receiver and senders are then injected into
    ``resolver`` as ``fd_receiver`` and ``fd_senders``.

    NOTE(review): ``id`` is not defined in this block; it is expected to be a
    module-level name holding this node's id -- confirm against the file.

    Args:
        resolver: object providing ``dispatch_msg``, ``system_running`` and
            ``on_message_sent`` and receiving the created channels.
    """
    def fd_port(node_id):
        """Return the failure-detector port used for the given node id."""
        return 7000 + node_id

    # receiver channel for failure-detector messages from the other nodes
    receiver = FDReceiver(("0.0.0.0", fd_port(id)),
                          on_message_recv=resolver.dispatch_msg)
    Thread(target=receiver.listen).start()

    # one sender channel (and thread) per remote node, keyed by node id
    senders = {}
    for node in config.get_nodes().values():
        if id == node.id:
            continue
        fd_sender = FDSender(id, (node.hostname, fd_port(node.id)),
                             check_ready=resolver.system_running,
                             on_message_sent=resolver.on_message_sent)
        senders[node.id] = fd_sender
        Thread(target=fd_sender.start).start()

    # inject to resolver
    resolver.fd_senders = senders
    resolver.fd_receiver = receiver
    logger.info("All self-stab UDP senders connected")
def refresh(self, new_node):
    """Bring the resolver up to date after a node has joined the system.

    The node list is re-read, the new node count is pushed to the modules
    that track it, the failure detector's ``beat`` list gets one extra entry,
    and both a regular sender and a failure-detector sender for the new node
    are started in their own threads.

    NOTE(review): ``id`` is not defined in this block; it is expected to be a
    module-level name holding this node's id -- confirm against the file.

    Args:
        new_node: the node that was added; its ``id`` and ``hostname`` are
            used to address the new failure-detector sender.
    """
    self.nodes = get_nodes()
    node_count = len(self.nodes)

    # update modules with the new system size
    self.modules[Module.RECMA_MODULE].number_of_nodes = node_count
    self.modules[Module.RECSA_MODULE].number_of_nodes = node_count
    failure_detector = self.modules[Module.FAILURE_DETECTOR_MODULE]
    failure_detector.number_of_nodes = node_count
    failure_detector.beat += [0]  # one more slot for the new node
    self.modules[Module.JOINING_MECHANISM_MODULE].number_of_nodes = node_count

    # regular sender channel towards the new node
    sender_thread = Thread(target=self.run_sender_in_new_thread,
                           args=(new_node, ))
    sender_thread.start()

    # failure-detector sender channel towards the new node
    fd_sender = FDSender(id, (new_node.hostname, 7000 + new_node.id),
                         check_ready=self.system_running,
                         on_message_sent=self.on_message_sent)
    self.fd_senders[new_node.id] = fd_sender
    Thread(target=fd_sender.start).start()

    logger.info(f"System refreshed, now {len(self.nodes)} nodes in system")
def run(self, testing=False):
    """Main loop of the module, meant to run in a separate thread.

    Waits until the resolver reports that the system is running (skipped when
    ``testing`` is true), then repeatedly takes one message from
    ``msg_queue``, handles the token from its sender and answers that sender.
    On the first pass a message is additionally sent to every other node.

    Args:
        testing: when true, do not wait for the system and leave the loop
            after a single iteration.
    """
    # block until system is ready
    while not (testing or self.resolver.system_running()):
        time.sleep(0.1)

    while True:
        if not self.msg_queue.empty():
            sender_id = self.msg_queue.get()["sender"]
            self.upon_token_from_pj(sender_id)
            self.send_msg(sender_id)
        else:
            # nothing to process yet, back off briefly
            time.sleep(0.1)

        if testing:
            break

        if self.first_run:
            # kick things off by contacting every other node once
            for peer_id in conf.get_nodes():
                if peer_id != self.id:
                    self.send_msg(peer_id)
            self.first_run = False

        throttle()
def get_nodes_list():
    """Return all configured nodes as a JSON response.

    The response maps each node id to the dictionary produced by that node's
    ``to_dct()``. Used by the joining script to have a new node join the
    system.
    """
    serialized = {
        node_id: node.to_dct()
        for node_id, node in conf.get_nodes().items()
    }
    return jsonify(serialized)
def fetch_data_for_all_nodes(timeout=10):
    """Fetch data from all nodes through their ``/data`` endpoint.

    Each configured node is queried over HTTP at ``<ip>:<4000 + id>/data``.

    Args:
        timeout: seconds to wait for each node before giving up. Without a
            timeout a single unresponsive node would block the caller
            indefinitely.

    Returns:
        A list with one ``{"node": <node dict>, "data": <decoded JSON>}``
        entry per node, or ``None`` if anything fails for any node (the
        error, including a timeout, is logged).
    """
    try:
        data = []
        for node in conf.get_nodes().values():
            r = requests.get(f"http://{node.ip}:{4000+node.id}/data",
                             timeout=timeout)
            data.append({"node": node.to_dct(), "data": r.json()})
        return data
    except Exception as e:
        # best effort: callers get None instead of an exception
        logger.error(f"Error when fetching data for other nodes: {e}")
        return None
def test_config_can_parse_nodes_txt(self):
    """get_nodes parses a hosts file with one ``id,hostname,ip,port`` row per node."""
    s = "0,localhost,127.0.0.1,5000\n1,localhost,127.0.0.1,5001\n"
    path = "conf/tmp.txt"
    with open(path, "w") as f:
        f.write(s)

    try:
        hosts = config.get_nodes(hosts_path=path)
        self.assertEqual(len(hosts.values()), 2)
        self.assertEqual(hosts[0].id, 0)
        self.assertEqual(hosts[0].hostname, "localhost")
        self.assertEqual(hosts[0].ip, "127.0.0.1")
        self.assertEqual(hosts[0].port, 5000)
        self.assertEqual(hosts[1].id, 1)
    finally:
        # remove the temporary file even when an assertion fails
        os.remove(path)
def __init__(self, testing=False):
    """Initialize the resolver.

    All attributes are assigned before the background thread is started, so
    that ``wait_for_other_nodes`` never runs against a partially initialized
    resolver and the metric counters are not reset after the thread is
    already running.

    Args:
        testing: when true, the thread that waits for the other nodes is not
            started.
    """
    self.modules = None
    self.senders = {}
    self.fd_senders = {}
    self.receiver = None
    self.fd_receiver = None
    self.nodes = get_nodes()

    # locks used to avoid race conditions with modules
    self.view_est_lock = Lock()
    self.replication_lock = Lock()
    self.prim_mon_lock = Lock()

    self.own_comm_ready = False
    self.other_comm_ready = False
    self.system_status = SystemStatus.BOOTING

    # Support non-self-stabilizing mode: self-stabilization is on unless the
    # NON_SELF_STAB environment variable is set
    self.self_stab = os.getenv("NON_SELF_STAB") is None

    # metrics
    self.total_msgs_sent = 0
    self.view_est_msgs = 0
    self.view_est_bytes = 0
    self.rep_msgs = 0
    self.rep_bytes = 0
    self.prim_mon_msgs = 0
    self.prim_mon_bytes = 0
    self.fd_msgs = 0
    self.fd_bytes = 0
    self.total_bytes_sent = 0
    self.experiment_started = False

    # inject resolver in rate limiter module
    rate_limiter.resolver = self

    # check other nodes for system ready before starting system; started
    # last so the thread only ever sees a fully initialized resolver
    if not testing:
        t = Thread(target=self.wait_for_other_nodes)
        t.start()
def __init__(self, testing=False):
    """Initialize the resolver.

    All attributes are assigned before the background thread is started, so
    that ``wait_for_other_nodes`` never runs against a partially initialized
    resolver.

    Args:
        testing: when true, the thread that waits for the other nodes is not
            started.
    """
    self.modules = None
    self.senders = {}
    self.fd_senders = {}
    self.receiver = None
    self.fd_receiver = None
    self.nodes = get_nodes()

    self.own_comm_ready = False
    self.other_comm_ready = False
    self.system_status = SystemStatus.BOOTING

    # Support non-self-stabilizing mode: self-stabilization is on unless the
    # NON_SELF_STAB environment variable is set
    self.self_stab = os.getenv("NON_SELF_STAB") is None

    # inject resolver in rate limiter module
    rate_limiter.resolver = self

    # check other nodes for system ready before starting system; started
    # last so the thread only ever sees a fully initialized resolver
    if not testing:
        t = Thread(target=self.wait_for_other_nodes)
        t.start()
def send_msg(self):
    """Send this node's view-establishment state to every other node.

    For each configured node ``j`` other than this one, a
    ``VIEW_ESTABLISHMENT_MESSAGE`` is handed to the resolver. Its payload
    holds ``own_data`` (phase, witnesses and the two values returned by
    ``pred_and_action.get_info`` for this node) and ``about_data`` (the same
    four values as this node holds them for node ``j``). For this node's own
    id no message is sent; the local ``echo`` entry is refreshed instead.

    Byzantine behaviour, when enabled through ``byz``:
      * ``UNRESPONSIVE``: nothing is sent and no state is touched.
      * ``DIFFERENT_VIEWS``: ``own_data`` is replaced so that nodes with an
        even id are told view 1 and nodes with an odd id view 2.
      * ``FORCING_RESET``: ``own_data`` carries ``pred_and_action.RST_PAIR``
        as its view pair.
    """
    # stay silent if node configured to be unresponsive
    if byz.is_byzantine() and byz.get_byz_behavior() == byz.UNRESPONSIVE:
        return

    for node_j in conf.get_nodes():
        if node_j == self.id:
            # update own echo instead of sending message
            info = self.pred_and_action.get_info(self.id)
            self.echo[self.id] = {
                VIEWS: info[0],
                PHASE: self.phs[self.id],
                WITNESSES: self.witnesses[self.id],
                VCHANGE: info[1],
            }
            continue

        # node_i's own data
        own_info = self.pred_and_action.get_info(self.id)
        own_data = [
            deepcopy(self.phs[self.id]),
            deepcopy(self.witnesses[self.id]),
            deepcopy(own_info[0]),
            deepcopy(own_info[1]),
        ]

        # what node_i thinks about node_j
        about_info = self.pred_and_action.get_info(node_j)
        about_data = [
            deepcopy(self.phs[node_j]),
            deepcopy(self.witnesses[node_j]),
            deepcopy(about_info[0]),
            deepcopy(about_info[1]),
        ]

        # a Byzantine node overwrites own_data to trick the receivers
        if byz.is_byzantine():
            if byz.get_byz_behavior() == byz.DIFFERENT_VIEWS:
                # different views for even and odd receiver ids
                fake_view = 1 if node_j % 2 == 0 else 2
                own_data = [
                    0, True, {CURRENT: fake_view, NEXT: fake_view}, False
                ]
            elif byz.get_byz_behavior() == byz.FORCING_RESET:
                own_data = [0, True, self.pred_and_action.RST_PAIR, False]

        payload = {
            "own_data": deepcopy(own_data),
            "about_data": deepcopy(about_data),
        }
        msg = {
            "type": MessageType.VIEW_ESTABLISHMENT_MESSAGE,
            "sender": self.id,
            "data": payload,
        }
        self.resolver.send_to_node(node_j, msg)