def send_msg_to_client(self, client_ip, client_port, status, sock, write_req_id):
    """Send the write reply to a client, then tear down the request.

    Args:
        client_ip: host stored in the reply's ``recv_host`` field.
        client_port: port stored in the reply's ``recv_port`` field.
        status: outcome stored under the ``'status'`` key of the reply
            payload (can be +1 or -1).
        sock: socket the reply is sent on; it is removed from
            ``self.inputs`` and closed before returning.
        write_req_id: identifier handed to ``self.clear_write_req_data``.

    Raises:
        Whatever ``send_msg`` raises; the socket and the write-request
        bookkeeping are still released in that case.
    """
    reply_msg = Message(Msg_type['write_reply'],
                        recv_host=client_ip,
                        recv_port=client_port,
                        data_dict={'status': status})
    reply_msg._msg_id = (self.node_id, threading.current_thread().ident)

    try:
        send_msg(sock, reply_msg)
    finally:
        # Release everything even when the send fails, otherwise the socket
        # stays open and registered in self.inputs, and the write-request
        # data is never cleared.
        if sock in self.inputs:
            self.inputs.remove(sock)
        sock.close()
        # clean dicts
        self.clear_write_req_data(write_req_id)
def heartbeat_thread_fn(self):
    '''
    Does all processes related to heartbeat receiving and sending.

    Runs forever; intended to be the target of a dedicated thread.

    Leader (``self.is_leader`` true), once per ``self.heartbeat_delay``:
      * drains this thread's queue and treats every message's sender
        (``_msg_id[0]``) as having responded;
      * bumps a per-node time-out counter for every node in
        ``self.network_dict`` that did not respond, resets it otherwise;
      * removes nodes whose counter reached ``self.timeout_thresh`` and
        sends a ``delete_node`` message about each to the remaining nodes;
      * sends a heartbeat to every remaining node.

    Non-leader, once per ``self.ldr_heartbeat_delay``:
      * answers every queued heartbeat that is not itself a reply with a
        heartbeat carrying ``{'type': 'reply'}``;
      * counts periods without a heartbeat from ``self.ldr_id`` and, once
        ``self.timeout_thresh`` is reached, marks the leader dead, drops
        it from ``self.network_dict`` and starts ``ldrelect_thread_fn``
        in a new thread.

    While ``self.pause_heartbeat`` is set the loop idles, sleeping one
    period between checks instead of spinning.
    '''
    self.pause_heartbeat = False
    tid = threading.current_thread().ident
    self.thread_msg_qs[tid] = queue.Queue()
    heartbeat_msg = Message(Msg_type['heartbeat'],
                            msg_id=(self.node_id, tid))

    def deliver(msg, addr, data):
        """Stamp msg for addr with payload data and send it over a fresh
        connection; return False if the peer cannot be reached."""
        try:
            # send using a temporary port
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.connect(addr)
                msg._source_host, msg._source_port = s.getsockname()
                msg._recv_host, msg._recv_port = addr
                msg._msg_id = (self.node_id, tid)
                msg._data_dict = data
                send_msg(s, msg)
        except OSError:
            # Best effort: an unreachable peer is detected through the
            # time-out counters, it must not kill the heartbeat thread.
            return False
        return True

    # dict of type [node_id : count of time-outs]
    node_timeouts = {n_id: -1 for n_id in list(self.network_dict)}

    while True:
        # for a leader node
        if self.is_leader:
            if self.pause_heartbeat:
                # Sleep instead of busy-waiting while paused.
                time.sleep(self.heartbeat_delay)
                continue

            # Collect all messages from queue:
            responded_nodes = set()
            q = self.thread_msg_qs[tid]
            while not q.empty():
                responded_nodes.add(q.get()._msg_id[0])

            # Correct time-out counts and find nodes silent for too long.
            # Iterate a snapshot: other threads may mutate network_dict.
            to_del = []
            for n_id in list(self.network_dict):
                if n_id in responded_nodes:
                    node_timeouts[n_id] = 0
                else:
                    # Nodes first seen after start-up begin at 1.
                    node_timeouts[n_id] = node_timeouts.get(n_id, 0) + 1
                if node_timeouts[n_id] >= self.timeout_thresh:
                    print("NODE : ", n_id, " found unresponsive")
                    # TODO: what now? - initiate node deletion phase
                    to_del.append(n_id)

            # delete in self and send to all
            for n_id in to_del:
                self.network_dict.pop(n_id, None)
                node_timeouts.pop(n_id, None)

            for n_to_delete in to_del:
                del_msg = Message(Msg_type['delete_node'],
                                  msg_id=(self.node_id, tid))
                for entry in list(self.network_dict.values()):
                    deliver(del_msg, (entry[0], entry[1]),
                            {'id': n_to_delete})

            # Send a heartbeat to everyone and start a timer
            for entry in list(self.network_dict.values()):
                deliver(heartbeat_msg, (entry[0], entry[1]), {})

            # re-starting timer
            time.sleep(self.heartbeat_delay)

        # for a non-leader node
        else:
            if self.pause_heartbeat:
                # Sleep instead of busy-waiting while paused.
                time.sleep(self.ldr_heartbeat_delay)
                continue

            got_ldr_hbeat = False
            q = self.thread_msg_qs[tid]
            while not q.empty():
                hmsg = q.get()
                # Replies to our own heartbeats need no answer.
                if hmsg.get_data('type') == 'reply':
                    continue

                self.ldr_timeout_count = 0

                hbeat_id = hmsg._msg_id[0]
                if hbeat_id == self.node_id:
                    reply_addr = (self.HOST, self.PORT)
                else:
                    entry = self.network_dict.get(hbeat_id)
                    if entry is None:
                        # Sender is no longer in the table (e.g. a leader
                        # this node already removed after declaring it
                        # failed): nothing to reply to, and a KeyError
                        # here would end the heartbeat thread.
                        continue
                    reply_addr = (entry[0], entry[1])

                if hbeat_id == self.ldr_id:
                    got_ldr_hbeat = True

                # reply to heartbeat
                deliver(heartbeat_msg, reply_addr, {'type': 'reply'})

            if self.ldr_alive:
                if got_ldr_hbeat:
                    self.ldr_timeout_count = 0
                else:
                    self.ldr_timeout_count += 1

            # check if leader has failed
            # NOTE(review): assumes ldr_stat_lock supports the
            # context-manager protocol like threading.Lock - confirm.
            # Using `with` guarantees release if anything below raises.
            with self.ldr_stat_lock:
                if self.ldr_timeout_count >= self.timeout_thresh:
                    self.ldr_timeout_count = 0
                    print("Leader failure detected")
                    self.ldr_alive = False
                    self.network_dict.pop(self.ldr_id, None)
                    leader_elect_thread = threading.Thread(
                        target=self.ldrelect_thread_fn, args=())
                    leader_elect_thread.start()
                    self.ldr_elect_tid = leader_elect_thread.ident

            # re-starting timer
            time.sleep(self.ldr_heartbeat_delay)
def ldrelect_thread_fn(self):
    """
    Tasked with the selection of the new leader.

    Registers a message queue for this thread in ``self.thread_msg_qs``
    and repeats election rounds until a ``ldr_proposal`` message has been
    delivered or ``self.ldr_alive`` becomes true:

      * if this node has the smallest id among the nodes known when the
        election started, it sends the proposal to its own
        ``(self.HOST, self.PORT)`` and exits;
      * otherwise it sends a heartbeat to every other known node, waits
        ``self.heartbeat_delay * self.timeout_thresh`` seconds, takes the
        smallest id among the nodes whose messages reached this thread's
        queue (plus itself) and sends the proposal to that node.

    On exit - normal or via an exception - ``self.ldr_elect_tid`` is
    reset to ``None`` and this thread's queue is unregistered.
    """
    print("DEBUG_MSG: Leader Election started ")
    tid = threading.get_ident()
    self.thread_msg_qs[tid] = queue.Queue()

    def deliver(msg, addr):
        """Stamp msg for addr and send it over a fresh connection; return
        False if the peer cannot be reached."""
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.connect(addr)
                msg._source_host, msg._source_port = s.getsockname()
                msg._recv_host, msg._recv_port = addr
                msg._msg_id = (self.node_id, tid)
                send_msg(s, msg)
        except OSError:
            return False
        return True

    try:
        heartbeat_msg = Message(Msg_type['heartbeat'])
        # Candidate set is fixed at election start; sort once.
        nodes = sorted(list(self.network_dict) + [self.node_id])
        has_leader = False

        while not has_leader and not self.ldr_alive:
            print(nodes)

            # if this is itself the smallest id node
            if nodes[0] == self.node_id:
                own_addr = (self.HOST, self.PORT)
                if deliver(Message(Msg_type['ldr_proposal']), own_addr):
                    # assume that beyond this point, the found node stays
                    # alive... or, this thread begins later again or in
                    # some other node
                    return
                # The self-proposal could not be delivered: fall through
                # to probing the peers and try again in this round rather
                # than abandoning the election.

            for n_id in nodes:
                if n_id == self.node_id:
                    continue
                print("DEBUG_MSG: sending heartbeat from ldr_elect to: ",
                      n_id)
                entry = self.network_dict.get(n_id)
                if entry is None:
                    # Node was removed from the table since the election
                    # started; nothing to probe.
                    continue
                deliver(heartbeat_msg, (entry[0], entry[1]))

            # now, the coordinator is responsible to pass the heartbeat
            # messages into this thread
            # wait for timeout amount of time before deciding which all
            # are alive
            # TODO: need to wait for multiple time-outs?
            time.sleep(self.heartbeat_delay * self.timeout_thresh)

            responded_nodes = {self.node_id}
            q = self.thread_msg_qs[tid]
            while not q.empty():
                responded_nodes.add(q.get()._msg_id[0])
            print("DEBUG_MSG: responded_nodes: ", responded_nodes)

            prospective_ldr = min(responded_nodes)
            if prospective_ldr == self.node_id:
                new_recv = (self.HOST, self.PORT)
            else:
                entry = self.network_dict.get(prospective_ldr)
                if entry is None:
                    # Responder is not (or no longer) in the table: its
                    # address is unknown, so run another round.
                    continue
                new_recv = (entry[0], entry[1])

            # Only stop once the proposal was actually delivered; on
            # failure go to the outer while loop and re-start the process.
            has_leader = deliver(Message(Msg_type['ldr_proposal']), new_recv)
    finally:
        # clear its existence before exiting, even on an unexpected error
        self.ldr_elect_tid = None
        self.thread_msg_qs.pop(tid, None)