def init_delete(self):
    """Remove this node from the network and terminate the process.

    A sponsor node refuses the request and returns. A leader first picks
    the smallest key in ``self.network_dict`` as the new leader and
    announces it to every peer with a ``new_ldr_id`` message of type
    ``'del_ldr'``. The node then sends a ``delete_node`` message to the
    (possibly new) leader and ends the process with ``os._exit(0)``.

    Peers that cannot be reached are skipped: an unreachable peer must not
    leave this node half-deleted (leadership already given up, process
    still running).
    """
    if self.is_sponsor:
        print("Cannot delete Node. Currently a sponsor node")
        return

    msg_id = (self.node_id, threading.current_thread().ident)

    if self.is_leader:
        if not self.network_dict:
            # Nobody to hand leadership to and nobody to notify.
            print("DEBUG_MSG: no peers in network table; exiting")
            os._exit(0)

        # Get the smallest key and broadcast it as the new leader id.
        new_ldr_id = min(self.network_dict)
        new_ldr_ip = self.network_dict[new_ldr_id][0]
        new_ldr_port = self.network_dict[new_ldr_id][1]

        # Snapshot the table: other threads may change it while we send.
        for peer in list(self.network_dict.values()):
            new_ldr_msg = Message(Msg_type['new_ldr_id'], msg_id=msg_id)
            new_ldr_msg._source_host, new_ldr_msg._source_port = self.HOST, self.PORT
            new_ldr_msg._recv_host, new_ldr_msg._recv_port = peer[0], peer[1]
            new_ldr_msg._data_dict = {
                'type': 'del_ldr',
                'id': new_ldr_id,
                'ip': new_ldr_ip,
                'port': new_ldr_port,
            }
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as soc:
                try:
                    soc.connect((new_ldr_msg._recv_host, new_ldr_msg._recv_port))
                    send_msg(soc, new_ldr_msg)
                except OSError as err:
                    print("DEBUG_MSG: could not send new leader id to",
                          (peer[0], peer[1]), ":", err)
                    continue
            print("DEBUG_MSG: Sent new leader id: ", new_ldr_id)

        self.is_leader = False
        self.ldr_port = new_ldr_port
        self.ldr_ip = new_ldr_ip
        # Give the peers a moment to process the leader change.
        time.sleep(1)

    # Send delete_msg to the leader and stop.
    delete_msg = Message(Msg_type['delete_node'], msg_id=msg_id)
    delete_msg._source_host, delete_msg._source_port = self.HOST, self.PORT
    delete_msg._recv_host, delete_msg._recv_port = self.ldr_ip, self.ldr_port
    delete_msg._data_dict = {'id': self.node_id}
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as soc:
        try:
            soc.connect((self.ldr_ip, self.ldr_port))
            send_msg(soc, delete_msg)
        except OSError as err:
            # Still exit below; the node is leaving regardless.
            print("DEBUG_MSG: could not send delete message to leader:", err)

    # Let the message leave the socket buffer before the hard exit.
    time.sleep(1)
    os._exit(0)
def coordination_thread_fn(self, heartbeat_tid):
    """Accept connections on (HOST, PORT) and route every received message.

    Runs a ``select`` loop over ``self.inputs``. New connections are added
    to ``self.inputs``; for every other readable socket one message is read
    with ``recv_msg`` and dispatched on its ``Msg_type``: either queued for
    an existing thread via ``self.thread_msg_qs``, handled inline, or handed
    to a newly started handler thread. Unless a branch hands the socket to a
    handler thread (write/consistency/read requests, file-system data), the
    socket is removed from ``self.inputs`` and closed after dispatch.

    Args:
        heartbeat_tid: kept for interface compatibility; heartbeats are
            routed using ``self.heartbeat_tid``.
    """
    print("Listening on port :", self.PORT)
    self.thread_msg_qs[threading.get_ident()] = queue.Queue()

    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # prevents "already in use" errors
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server.setblocking(0)
    server.bind((self.HOST, self.PORT))
    server.listen(5)
    self.inputs.append(server)

    outputs = []
    message_queues = {}  # per-connection outgoing message queues
    become_ldr_evnt = None  # event of the become-leader thread, once started

    # Write-protocol messages that are only forwarded to the worker thread
    # registered under their 'write_id'; the value is the log label.
    write_phase_labels = {
        Msg_type.WR_AGREED: "AGREED",   # only received by a leader
        Msg_type.WR_ABORT: "ABORT",     # leader or non-leader node
        Msg_type.WR_COMMIT: "COMMIT",   # received by non-leader node
        Msg_type.WR_ACK: "ACK",         # only received by a leader
    }

    while self.inputs:
        readable, writable, exceptional = select.select(
            self.inputs, outputs, self.inputs)

        for s in readable:
            if s is server:
                # for new connections
                connection, client_address = s.accept()
                print("DEBUG_MSG: got connection request from: ", client_address)
                connection.setblocking(0)
                self.inputs.append(connection)
                print("DEBUG_MSG: Received connection request from: ", client_address)
                # creating a message queue for each connection
                message_queues[connection] = queue.Queue()
                continue

            # some message has been received - be it in part
            msg = recv_msg(s)
            if msg:
                m_type = Msg_type(msg._m_type)
                print("DEBUG_MSG: data received: Msg_type:", m_type)
                # Truthiness check also covers an empty id tuple.
                if msg._msg_id:
                    print("Message sender id: ", msg._msg_id[0])

                # Stamp the message with the address of the socket it
                # actually arrived on (not the last accepted connection).
                try:
                    peer = s.getpeername()
                except OSError:
                    peer = None
                if peer is not None:
                    msg._source_host = peer[0]
                    msg._source_port = peer[1]

                # find message type and send to the right thread
                if m_type is Msg_type.heartbeat:
                    self.thread_msg_qs[self.heartbeat_tid].put(msg)
                    # also send the heartbeat to the leader election thread;
                    # that thread removes its own queue on exit, so the
                    # lookup must tolerate a missing entry.
                    elect_tid = self.ldr_elect_tid
                    if elect_tid is not None:
                        elect_q = self.thread_msg_qs.get(elect_tid)
                        if elect_q is not None:
                            elect_q.put(msg)

                elif m_type is Msg_type.write_req or m_type is Msg_type.cons_req:
                    # write_req: comes directly from a client; should carry
                    #   'filedir', 'filename' and 'file' in its _data_dict.
                    # cons_req: comes from a new leader for maintaining
                    #   consistency; should carry 'filepath'.
                    if m_type is Msg_type.write_req:
                        handler = self.write_req_handler
                    else:
                        handler = self.cons_handler
                    cond = threading.Condition()
                    self.wrreq_id += 1
                    curr_wrreq_id = self.wrreq_id
                    self.wrreq_conditions[curr_wrreq_id] = cond
                    # the socket is passed on so the handler can answer on it
                    write_thread = threading.Thread(
                        target=handler,
                        args=(msg, cond, curr_wrreq_id, s))
                    write_thread.start()
                    # the thread ident only exists after start()
                    self.wrreq_tids[curr_wrreq_id] = write_thread.ident
                    self.thread_msg_qs[write_thread.ident] = queue.Queue()
                    continue  # we don't want this socket to close

                elif m_type is Msg_type.WR_ROUTE:
                    # only received by a leader
                    cond = threading.Condition()
                    write_thread = threading.Thread(
                        target=self.routed_write_handler, args=(msg, cond))
                    write_thread.start()

                elif m_type is Msg_type.WR_COMMIT_REQ:
                    # received by non-leader node
                    cond = threading.Condition()
                    write_id = msg._data_dict['write_id']
                    self.write_conditions[write_id] = cond
                    non_leader_write_thread = threading.Thread(
                        target=self.non_leader_write_handler,
                        args=(msg, cond))
                    non_leader_write_thread.start()
                    self.write_tids[write_id] = non_leader_write_thread.ident
                    self.thread_msg_qs[
                        non_leader_write_thread.ident] = queue.Queue()

                elif m_type in write_phase_labels:
                    try:
                        write_id = msg._data_dict['write_id']
                        self.thread_msg_qs[self.write_tids[write_id]].put(msg)
                        with self.write_conditions[write_id]:
                            # wake up the thread to take the message from its queue
                            self.write_conditions[write_id].notify()
                    except Exception as e:
                        print("Exception : " + write_phase_labels[m_type]
                              + " message\n", e)

                elif m_type is Msg_type.WR_REPLY:
                    # received by node who is in contact with client for write opn
                    try:
                        write_req_id = msg._data_dict['write_req_id']
                        self.thread_msg_qs[
                            self.wrreq_tids[write_req_id]].put(msg)
                        with self.wrreq_conditions[write_req_id]:
                            self.wrreq_conditions[write_req_id].notify()
                    except Exception as e:
                        print("Exception : REPLY message\n", e)

                elif m_type is Msg_type.AN_ldr_info:
                    # only the first sponsor reply is used
                    if self.sponser_set is False:
                        self.sponser_set = True
                        self.thread_msg_qs[self.main_thread_tid].put(msg)
                        with self.AN_condition:
                            self.AN_condition.notify_all()  # ask thread to wake up

                elif m_type is Msg_type.AN_add_to_network:
                    # new node to be added to table
                    self.network_dict[msg.get_data('key')] = msg.get_data('value')
                    # keep the field updated in case leader fails
                    self.last_node_id = msg.get_data('key')
                    print("***************Updated Network Dict***************")
                    print(self.network_dict)

                elif m_type is Msg_type.AN_set_id:
                    # new id assigned by leader; accept it from the leader only
                    if (msg._source_host == self.ldr_ip
                            and msg.get_data('port') == self.ldr_port):
                        self.thread_msg_qs[self.main_thread_tid].put(msg)
                        with self.AN_condition:
                            self.AN_condition.notify_all()  # ask thread to wake up

                elif m_type is Msg_type.AN_FS_data:
                    self.thread_msg_qs[self.main_thread_tid].put(msg)
                    if self.file_system_name is None:
                        with self.AN_condition:
                            self.AN_condition.notify_all()  # ask thread to wake up
                    continue  # keep this socket open

                elif m_type is Msg_type.add_node:
                    # sponsor node on receiving 'add_node'
                    add_node_thread = threading.Thread(
                        target=self.send_AN_ldr_info,
                        args=(msg._source_host, msg.get_data('port')))
                    add_node_thread.start()

                elif m_type is Msg_type.AN_assign_id:
                    # ask leader for new id
                    AN_assign_id_thread = threading.Thread(
                        target=self.assign_new_id,
                        args=(msg._source_host, msg.get_data('port')))
                    AN_assign_id_thread.start()

                elif m_type is Msg_type.AN_FS_data_req:
                    send_file_system_thread = threading.Thread(
                        target=self.send_file_system,
                        args=(msg._source_host, msg.get_data('port')))
                    send_file_system_thread.start()

                elif m_type is Msg_type.read_request:
                    # the reader thread uses this socket in blocking mode
                    s.settimeout(None)
                    read_thread = threading.Thread(
                        target=self.send_file,
                        args=(msg.get_data('filename'),
                              msg.get_data('filedir'), s))
                    read_thread.start()
                    continue  # keep this socket open

                elif m_type is Msg_type.ldr_proposal:
                    # spawn a become_leader thread if it doesn't exist;
                    # nothing to do when this node already is the leader
                    if not self.is_leader and self.become_ldr_tid is None:
                        become_ldr_evnt = threading.Event()
                        become_ldr_thread = threading.Thread(
                            target=self.become_ldr_thread_fn,
                            args=(become_ldr_evnt,))
                        become_ldr_thread.start()
                        self.become_ldr_tid = become_ldr_thread.ident
                        self.thread_msg_qs[self.become_ldr_tid] = queue.Queue()

                elif m_type is Msg_type.new_ldr_id:
                    ldr_msg_kind = msg.get_data('type')
                    if ldr_msg_kind == 'reply':
                        # reply to a proposal sent from here: pass it to the
                        # become-leader thread if one is (still) running
                        become_q = self.thread_msg_qs.get(self.become_ldr_tid)
                        if become_q is not None:
                            become_q.put(msg)
                        if become_ldr_evnt is not None:
                            become_ldr_evnt.set()

                    elif ldr_msg_kind == 'del_ldr':
                        # leader is leaving and names its successor
                        self.ldr_id = msg.get_data('id')
                        self.ldr_ip = msg.get_data('ip')
                        self.ldr_port = msg.get_data('port')
                        self.ldr_alive = True
                        print("DEBUG_MSG: New leader:", self.ldr_id)
                        if self.ldr_id == self.node_id:
                            self.is_leader = True

                    else:
                        # msg from some other node that seeks vote for itself
                        proposer_id = msg._msg_id[0]
                        agreed = False
                        self.ldr_stat_lock.acquire()
                        try:
                            self.ldr_timeout_count = -1 * self.timeout_thresh
                            self.ldr_id = msg.get_data('id')
                            self.ldr_ip = msg.get_data('ip')
                            self.ldr_port = msg.get_data('port')
                            print("DEBUG_MSG: new leader found: ", self.ldr_id)
                            agreed = self.ldr_agreement_fn(proposer_id)
                            self.ldr_alive = True
                        finally:
                            # always release, even if a lookup above raised
                            self.ldr_stat_lock.release()

                        proposer = self.network_dict.get(proposer_id)
                        if agreed and proposer is not None:
                            new_recv = (proposer[0], proposer[1])
                            new_msg = Message(
                                Msg_type['new_ldr_id'],
                                msg_id=(self.node_id,
                                        threading.current_thread().ident))
                            new_msg._data_dict = {'type': 'reply', 'ans': 'ACK'}
                            # The inbound socket is already connected, so the
                            # reply needs a socket of its own.
                            with socket.socket(socket.AF_INET,
                                               socket.SOCK_STREAM) as reply_sock:
                                try:
                                    reply_sock.connect(new_recv)
                                    (new_msg._source_host,
                                     new_msg._source_port) = reply_sock.getsockname()
                                    new_msg._recv_host, new_msg._recv_port = new_recv
                                    send_msg(reply_sock, new_msg)
                                except OSError:
                                    pass  # best effort

                elif m_type is Msg_type.send_metadata:
                    # send this node's meta-data back to the requester
                    requester = self.network_dict.get(msg._msg_id[0])
                    if requester is not None:
                        new_recv = (requester[0], requester[1])
                        meta_msg = Message(
                            Msg_type['metadata_info'],
                            msg_id=(self.node_id,
                                    threading.current_thread().ident))
                        meta_msg._recv_host, meta_msg._recv_port = new_recv
                        meta_msg._data_dict = {'meta-data': self.meta_data}
                        # separate name: `s` must keep referring to the
                        # inbound socket so it is the one closed below
                        with socket.socket(socket.AF_INET,
                                           socket.SOCK_STREAM) as out_sock:
                            try:
                                out_sock.connect(new_recv)
                                (meta_msg._source_host,
                                 meta_msg._source_port) = out_sock.getsockname()
                                send_msg(out_sock, meta_msg)
                            except OSError:
                                pass  # best effort

                elif m_type is Msg_type.metadata_info:
                    become_q = self.thread_msg_qs.get(self.become_ldr_tid)
                    if become_q is not None:
                        become_q.put(msg)

                elif m_type is Msg_type.delete_node:
                    delete_node_thread = threading.Thread(
                        target=self.del_from_network_dict, args=(msg,))
                    delete_node_thread.start()

                elif m_type is Msg_type.init_delete:
                    # runs in this thread
                    self.init_delete()

            # Message handled (or nothing could be read): drop the connection.
            message_queues.pop(s, None)
            try:
                self.inputs.remove(s)
                s.close()
            except (ValueError, OSError):
                pass

        for s in writable:
            # If something has to be sent - send it.
            # Else, remove connection from output queue
            pending = message_queues.get(s)
            if pending is not None and not pending.empty():
                send_msg(s, pending.get())
            else:
                # indicate that server has nothing to send
                outputs.remove(s)

        for s in exceptional:
            # remove this connection and all its existences
            if s in self.inputs:
                self.inputs.remove(s)
            if s in outputs:
                outputs.remove(s)
            s.close()
            # the listening socket has no queue entry
            message_queues.pop(s, None)
def heartbeat_thread_fn(self):
    '''
    Does all processes related to heartbeat receiving and sending.

    Loops forever. On every round the node's current role is re-read:
    a leader counts missed heartbeat replies per node, removes nodes that
    reached ``self.timeout_thresh`` and sends a fresh heartbeat to all
    peers; a non-leader answers received heartbeats and starts
    ``self.ldrelect_thread_fn`` when the leader has been silent for
    ``self.timeout_thresh`` rounds. While ``self.pause_heartbeat`` is set
    the round is skipped.

    This is the single definition of the method; a second, older definition
    with the same name would silently replace it.
    '''
    self.pause_heartbeat = False
    self.thread_msg_qs[threading.current_thread().ident] = queue.Queue()
    heartbeat_msg = Message(Msg_type['heartbeat'],
                            msg_id=(self.node_id,
                                    threading.current_thread().ident))
    # dict of type [node_id : count of time-outs]
    node_timeouts = {n_id: -1 for n_id in self.network_dict.keys()}

    while True:
        if self.is_leader:
            delay = self.heartbeat_delay
        else:
            delay = self.ldr_heartbeat_delay

        if self.pause_heartbeat:
            # sleep instead of spinning at full speed while paused
            time.sleep(delay)
            continue

        if self.is_leader:
            self._leader_heartbeat_round(heartbeat_msg, node_timeouts)
        else:
            self._follower_heartbeat_round(heartbeat_msg)

        # re-starting timer
        time.sleep(delay)


def _leader_heartbeat_round(self, heartbeat_msg, node_timeouts):
    '''One leader round: count replies, drop silent nodes, send heartbeats.'''
    my_tid = threading.current_thread().ident

    # Collect all messages from queue:
    q = self.thread_msg_qs[my_tid]
    responded_nodes = set()
    while not q.empty():
        responded_nodes.add(q.get()._msg_id[0])

    # Work on a snapshot: other threads add/remove table entries.
    known_nodes = list(self.network_dict.keys())

    # correct time-out counts (nodes seen for the first time start at 1)
    for n_id in known_nodes:
        if n_id in responded_nodes:
            node_timeouts[n_id] = 0
        else:
            node_timeouts[n_id] = node_timeouts.get(n_id, 0) + 1

    # Check if someone has not responded for long:
    to_del = []
    for n_id in known_nodes:
        if node_timeouts[n_id] >= self.timeout_thresh:
            print("NODE : ", n_id, " found unresponsive")
            to_del.append(n_id)

    # delete in self and send to all
    for n_id in to_del:
        self.network_dict.pop(n_id, None)
        node_timeouts.pop(n_id, None)

    for n_to_delete in to_del:
        for entry in list(self.network_dict.values()):
            new_recv = (entry[0], entry[1])
            del_msg = Message(Msg_type['delete_node'],
                              msg_id=(self.node_id, my_tid))
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                try:
                    s.connect(new_recv)
                    del_msg._source_host, del_msg._source_port = s.getsockname()
                    del_msg._recv_host, del_msg._recv_port = new_recv
                    del_msg._data_dict = {'id': n_to_delete}
                    send_msg(s, del_msg)
                except OSError:
                    pass  # unreachable peer; its own time-out will catch it

    # Send a heartbeat to everyone
    for entry in list(self.network_dict.values()):
        new_recv = (entry[0], entry[1])
        # send messages to all using temporary port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.connect(new_recv)
                (heartbeat_msg._source_host,
                 heartbeat_msg._source_port) = s.getsockname()
                heartbeat_msg._recv_host, heartbeat_msg._recv_port = new_recv
                heartbeat_msg._msg_id = (self.node_id, my_tid)
                heartbeat_msg._data_dict = {}
                send_msg(s, heartbeat_msg)
            except OSError:
                pass


def _follower_heartbeat_round(self, heartbeat_msg):
    '''One non-leader round: answer heartbeats and watch the leader.'''
    my_tid = threading.current_thread().ident
    got_ldr_hbeat = False

    q = self.thread_msg_qs[my_tid]
    while not q.empty():
        hmsg = q.get()
        # replies to heartbeats are not answered again
        if hmsg.get_data('type') == 'reply':
            continue
        self.ldr_timeout_count = 0

        hbeat_id = hmsg._msg_id[0]
        if hbeat_id == self.ldr_id:
            got_ldr_hbeat = True

        if hbeat_id == self.node_id:
            hmsg_ip, hmsg_port = self.HOST, self.PORT
        else:
            sender = self.network_dict.get(hbeat_id)
            if sender is None:
                # sender is not (or no longer) in the table: nowhere to reply
                continue
            hmsg_ip, hmsg_port = sender[0], sender[1]

        # reply to heartbeat
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.connect((hmsg_ip, hmsg_port))
                (heartbeat_msg._source_host,
                 heartbeat_msg._source_port) = s.getsockname()
                heartbeat_msg._recv_host, heartbeat_msg._recv_port = (
                    hmsg_ip, hmsg_port)
                heartbeat_msg._msg_id = (self.node_id, my_tid)
                heartbeat_msg._data_dict = {'type': 'reply'}
                send_msg(s, heartbeat_msg)
            except OSError:
                pass

    if not self.ldr_alive:
        # an election is already under way; nothing to monitor
        return

    if got_ldr_hbeat:
        self.ldr_timeout_count = 0
    else:
        self.ldr_timeout_count += 1

    # check if leader has failed
    self.ldr_stat_lock.acquire()
    try:
        if self.ldr_timeout_count >= self.timeout_thresh:
            self.ldr_timeout_count = 0
            print("Leader failure detected")
            self.ldr_alive = False
            self.network_dict.pop(self.ldr_id, None)
            leader_elect_thread = threading.Thread(
                target=self.ldrelect_thread_fn, args=())
            leader_elect_thread.start()
            self.ldr_elect_tid = leader_elect_thread.ident
    finally:
        self.ldr_stat_lock.release()
def ldrelect_thread_fn(self):
    """
    Tasked with the selection of the new leader.

    Repeats until a proposal has been sent or ``self.ldr_alive`` becomes
    true: if this node has the smallest id among the nodes known at start,
    it sends a ``ldr_proposal`` to its own coordinator and returns.
    Otherwise it sends a heartbeat to every other known node, waits
    ``heartbeat_delay * timeout_thresh`` seconds, and sends the
    ``ldr_proposal`` to the smallest id among the nodes whose messages
    arrived in this thread's queue (this node included).

    On every exit path, including exceptions, ``self.ldr_elect_tid`` is
    reset and this thread's queue is removed from ``self.thread_msg_qs``.
    """
    print("DEBUG_MSG: Leader Election started ")
    my_tid = threading.get_ident()
    self.thread_msg_qs[my_tid] = queue.Queue()

    try:
        heartbeat_msg = Message(Msg_type['heartbeat'])
        has_leader = False
        nodes = list(self.network_dict.keys())
        nodes.append(self.node_id)

        while not has_leader and not self.ldr_alive:
            nodes = sorted(nodes)
            print(nodes)

            # if this is itself the smallest id node
            if nodes[0] == self.node_id:
                msg = Message(Msg_type['ldr_proposal'])
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    try:
                        s.connect((self.HOST, self.PORT))
                    except OSError:
                        pass  # fall through to a normal round and retry
                    else:
                        msg._source_host, msg._source_port = s.getsockname()
                        msg._recv_host, msg._recv_port = (self.HOST, self.PORT)
                        msg._msg_id = (self.node_id, my_tid)
                        send_msg(s, msg)
                        return

            for n_id in nodes:
                if n_id == self.node_id:
                    continue
                print("DEBUG_MSG: sending heartbeat from ldr_elect to: ", n_id)
                entry = self.network_dict.get(n_id)
                if entry is None:
                    continue  # node was removed after the election started
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    try:
                        s.connect((entry[0], entry[1]))
                        (heartbeat_msg._source_host,
                         heartbeat_msg._source_port) = s.getsockname()
                        heartbeat_msg._recv_host = entry[0]
                        heartbeat_msg._recv_port = entry[1]
                        heartbeat_msg._msg_id = (self.node_id, my_tid)
                        send_msg(s, heartbeat_msg)
                    except OSError:
                        pass

            # now, the coordinator is responsible to pass the heartbeat
            # messages into this thread; wait for timeout amount of time
            # before deciding which all are alive
            time.sleep(self.heartbeat_delay * self.timeout_thresh)

            responded_nodes = {self.node_id}
            q = self.thread_msg_qs[my_tid]
            while not q.empty():
                responded_nodes.add(q.get()._msg_id[0])
            print("DEBUG_MSG: responded_nodes: ", responded_nodes)

            prospective_ldr = min(responded_nodes)
            if prospective_ldr == self.node_id:
                new_recv = (self.HOST, self.PORT)
            else:
                entry = self.network_dict.get(prospective_ldr)
                if entry is None:
                    continue  # address unknown; run another round
                new_recv = (entry[0], entry[1])

            msg = Message(Msg_type['ldr_proposal'])
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                try:
                    s.connect(new_recv)
                except OSError:
                    pass  # go to outer while loop and re-start process
                else:
                    msg._source_host, msg._source_port = s.getsockname()
                    msg._recv_host, msg._recv_port = new_recv
                    msg._msg_id = (self.node_id, my_tid)
                    # assume that beyond this point, the found node stays
                    # alive, or this thread begins later again or in some
                    # other node
                    has_leader = True
                    send_msg(s, msg)
    finally:
        # clear its existence before exiting
        self.ldr_elect_tid = None
        self.thread_msg_qs.pop(my_tid, None)
def become_ldr_thread_fn(self, evnt):
    """Take over leadership and run a file consistency check.

    Steps:
      1. Send a ``new_ldr_id`` proposal carrying this node's id/ip/port to
         every node in ``self.network_dict`` and mark this node as leader.
      2. Sleep 10 seconds so the network table can settle.
      3. Ask every node for its meta-data (``send_metadata``) and collect
         the ``metadata_info`` answers from this thread's queue.
      4. For every file whose version differs between nodes (or that a
         node is missing), send a ``cons_req`` to this node's own
         coordinator and print the returned status.

    On every exit path this thread's queue is removed from
    ``self.thread_msg_qs`` and ``self.become_ldr_tid`` is reset.

    Args:
        evnt: event created by the coordinator for this thread; it is not
            waited on here.
    """
    my_tid = threading.current_thread().ident
    try:
        # send ldr_agreement msg to all
        msg = Message(Msg_type['new_ldr_id'], msg_id=(self.node_id, my_tid))
        msg._data_dict = {
            'id': self.node_id,
            'ip': self.HOST,
            'port': self.PORT,
            'type': 'proposal'
        }
        for n_id, entry in list(self.network_dict.items()):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                print("Become Leader started: ", n_id)
                new_recv = (entry[0], entry[1])
                try:
                    s.connect(new_recv)
                except OSError:
                    continue
                msg._source_host, msg._source_port = s.getsockname()
                msg._recv_host, msg._recv_port = new_recv
                print("DEBUG_MSG: sending new leader msg to :", n_id)
                send_msg(s, msg)

        self.is_leader = True
        self.ldr_id = self.node_id
        self.ldr_port = self.PORT
        self.ldr_ip = self.HOST
        self.ldr_alive = True
        print("Leader election complete: ", self.ldr_id, " ", self.ldr_port)

        # wait for the network table to be updated
        print("UPDATING NETWORK TABLE")
        time.sleep(10)

        print("INITIATING CONSISTENCY CHECK:")
        new_msg = Message(Msg_type['send_metadata'],
                          msg_id=(self.node_id, my_tid))
        # send to all alive nodes - try a max_tries number of times:
        for n_id, entry in list(self.network_dict.items()):
            new_recv = (entry[0], entry[1])
            for _ in range(self.max_tries):
                # fresh socket per attempt: a socket whose connect failed
                # should not be reused
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    try:
                        s.connect(new_recv)
                        (new_msg._source_host,
                         new_msg._source_port) = s.getsockname()
                        new_msg._recv_host, new_msg._recv_port = new_recv
                        send_msg(s, new_msg)
                    except OSError:
                        continue
                    else:
                        break

        # wait on queue for all meta datas:
        q = self.thread_msg_qs[my_tid]
        node_metadata_dict = {}  # all node-ids with metadata
        count_tries = 0
        while (len(node_metadata_dict) != len(self.network_dict)
               and count_tries < self.max_tries):
            time.sleep(self.heartbeat_delay * self.timeout_thresh)
            while not q.empty():
                msg = q.get()
                # same type test as the coordinator uses
                if Msg_type(msg._m_type) is not Msg_type.metadata_info:
                    continue
                node_metadata_dict[msg._msg_id[0]] = msg._data_dict['meta-data']
            count_tries += 1

        # collect all files (directories have data[0] == 0 and are ignored)
        all_files_set = set()
        for metadata in node_metadata_dict.values():
            for entry, data in metadata.items():
                if data[0] != 0:
                    all_files_set.add(entry)

        # stores file path with: [latest version no, node with latest
        # version, is there inconsistency]
        file_version_dict = {}
        for file in all_files_set:
            record = [-1, self.node_id, False]
            file_version_dict[file] = record
            if file not in self.meta_data:
                record[2] = True
            else:
                record[0] = self.meta_data[file][2]

            for node, metadata in node_metadata_dict.items():
                file_metadata = metadata.get(file)
                if file_metadata is None:
                    # this node does not have the file at all
                    record[2] = True
                    continue
                if record[0] != file_metadata[2]:
                    record[2] = True
                if record[0] < file_metadata[2]:
                    record[0] = file_metadata[2]
                    record[1] = node

        # Update all files that have inconsistency somewhere
        inconsistent_files = {
            file for file, entry in file_version_dict.items() if entry[2]
        }
        print("DEBUG_MSG: found %d number(s) of inconsistencies: " %
              (len(inconsistent_files)))

        for file in inconsistent_files:
            # the request goes to this node's own coordinator
            new_recv = (self.HOST, self.PORT)
            msg = Message(Msg_type['cons_req'], msg_id=(self.node_id, my_tid))
            msg._data_dict = {'filepath': file}
            msg._recv_host, msg._recv_port = new_recv
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                try:
                    s.connect(new_recv)
                except OSError:
                    continue
                msg._source_host, msg._source_port = s.getsockname()
                send_msg(s, msg)
                ack = recv_msg(s)
            status = ack.get_data('status') if ack else None
            print("DEBUG_MSG: Consistency Check: file: ", file,
                  ": status: ", status)
    finally:
        # wipe off its existence
        self.thread_msg_qs.pop(my_tid, None)
        self.become_ldr_tid = None