def __process_queue(self):
    """Attempt delivery of every message currently in the queue.

    Each message that is in the queue when the pass starts is examined
    exactly once. A message is handed to the backend unless at least one
    partition in ``self.partitions`` reports its source and destination
    nodes as partitioned; in that case it is put back on the queue.

    The queue lock is held for the whole pass and is released even when
    handling a message raises, so a bad message cannot leave the queue
    permanently locked.
    """
    self.msg_queue_lock.acquire()
    try:
        # Fixed iteration count: held-back messages are re-queued on the
        # left while we pop from the right, so each original message is
        # visited once and the relative order is preserved.
        for _ in range(len(self.msg_queue)):
            msg = self.msg_queue.pop()
            assert isinstance(msg, CustomMessage)

            src_node = self.nodes[msg.source]
            if msg.destination not in self.nodes:
                log.warning("Message with unknown destination {}".format(
                    msg.destination))
                dst_node = None
            else:
                dst_node = self.nodes[msg.destination]

            blocked = any(p.are_partitioned(src_node, dst_node)
                          for p in self.partitions.values())
            if blocked:
                self.msg_queue.appendleft(msg)
            else:
                self.backend.send_message(msg.destination, msg)
    finally:
        self.msg_queue_lock.release()
def __process_queue(self):
    """Attempt delivery of every message currently in the queue.

    Each message that is in the queue when the pass starts is examined
    exactly once. A message is handed to the backend unless at least one
    partition in ``self.partitions`` reports its source and destination
    nodes as partitioned; in that case it is put back on the queue.

    The queue lock is held for the whole pass and is released even when
    handling a message raises, so a bad message cannot leave the queue
    permanently locked.
    """
    self.msg_queue_lock.acquire()
    try:
        # Fixed iteration count: held-back messages are re-queued on the
        # left while we pop from the right, so each original message is
        # visited once and the relative order is preserved.
        for _ in range(len(self.msg_queue)):
            msg = self.msg_queue.pop()
            assert isinstance(msg, CustomMessage)

            src_node = self.nodes[msg.source]
            if msg.destination not in self.nodes:
                log.warning("Message with unknown destination {}".format(
                    msg.destination))
                dst_node = None
            else:
                dst_node = self.nodes[msg.destination]

            blocked = any(p.are_partitioned(src_node, dst_node)
                          for p in self.partitions.values())
            if blocked:
                self.msg_queue.appendleft(msg)
            else:
                self.backend.send_message(msg.destination, msg)
    finally:
        self.msg_queue_lock.release()
def wait_for_state(self, state):
    """Block until the node reaches ``state`` or a failed/crashed state.

    Waits on ``self.cv`` until ``self.state`` equals the requested state,
    ``Node.STATE_FAILED`` or ``Node.STATE_CRASHED``. If the wait ends in
    a state other than the requested one, a warning is logged.

    Args:
        state: The state value to wait for (used as an index into
            ``Node.state_str`` when logging).
    """
    self.cv.acquire()
    try:
        while self.state not in (state, Node.STATE_FAILED,
                                 Node.STATE_CRASHED):
            self.cv.wait()
        # Capture the state while the lock is still held so the warning
        # below reports the state that actually ended the wait, not a
        # later one set after the lock was released.
        reached = self.state
    finally:
        # Always release, even if wait() is interrupted by an exception.
        self.cv.release()

    if reached != state:
        log.warning(
            "Node entered {} state while waiting for state {}".format(
                Node.state_str[reached], Node.state_str[state]))
def wait_for_state(self, state):
    """Block until the node reaches ``state`` or a failed/crashed state.

    Waits on ``self.cv`` until ``self.state`` equals the requested state,
    ``Node.STATE_FAILED`` or ``Node.STATE_CRASHED``. If the wait ends in
    a state other than the requested one, a warning is logged.

    Args:
        state: The state value to wait for (used as an index into
            ``Node.state_str`` when logging).
    """
    self.cv.acquire()
    try:
        while self.state not in (state, Node.STATE_FAILED,
                                 Node.STATE_CRASHED):
            self.cv.wait()
        # Capture the state while the lock is still held so the warning
        # below reports the state that actually ended the wait, not a
        # later one set after the lock was released.
        reached = self.state
    finally:
        # Always release, even if wait() is interrupted by an exception.
        self.cv.release()

    if reached != state:
        log.warning(
            "Node entered {} state while waiting for state {}".format(
                Node.state_str[reached], Node.state_str[state]))
def hello_sender(tries_left):
    """Send one hello message and either reschedule or give up.

    Does nothing once ``node_id`` is present in ``self.node_zid``.
    Otherwise the hello message is sent and one attempt is consumed:
    while attempts remain, this function is registered with the loop
    again for ``self.loop.time() + 1``; when the last attempt has just
    been used, a warning is logged and the node is set to
    ``Node.STATE_FAILED``.

    ``self``, ``node_id`` and ``zmq_msg`` are free variables supplied by
    the enclosing scope.
    """
    if node_id in self.node_zid:
        return

    self.__send_zmq_message(zmq_msg)
    remaining = tries_left - 1

    if remaining > 0:
        retry_at = self.loop.time() + 1
        self.loop.add_timeout(retry_at, hello_sender, tries_left=remaining)
    elif remaining == 0:
        log.warning("Node %s did not respond to hello message" % node_id)
        self.ds.nodes[node_id].set_state(Node.STATE_FAILED)
def recover_node(self, node_id, deliver):
    """Return a node from the partitioned state to the running state.

    Removes the partition registered under the node's failed-node
    partition name and then sets the node to ``Node.STATE_RUNNING``.

    Args:
        node_id: Key of the node in ``self.nodes``.
        deliver: Passed through to ``remove_partition``.

    Raises:
        ChistributedException: If ``node_id`` is not in ``self.nodes``
            or the node is not in ``Node.STATE_PARTITIONED``.
    """
    log.warning("recovering {}".format(node_id))

    if node_id not in self.nodes:
        raise ChistributedException("No such node: {}".format(node_id))

    node = self.nodes[node_id]
    if node.state != Node.STATE_PARTITIONED:
        raise ChistributedException(
            "Node {} is not in a failed state".format(node_id))

    partition_name = self.__failed_node_partition_name(node_id)
    self.remove_partition(partition_name, deliver)
    node.set_state(Node.STATE_RUNNING)
def hello_sender(tries_left):
    """Send one hello message and either reschedule or give up.

    Does nothing once ``node_id`` is present in ``self.node_zid``.
    Otherwise the hello message is sent and one attempt is consumed:
    while attempts remain, this function is registered with the loop
    again for ``self.loop.time() + 1``; when the last attempt has just
    been used, a warning is logged and the node is set to
    ``Node.STATE_FAILED``.

    ``self``, ``node_id`` and ``zmq_msg`` are free variables supplied by
    the enclosing scope.
    """
    if node_id in self.node_zid:
        return

    self.__send_zmq_message(zmq_msg)
    remaining = tries_left - 1

    if remaining > 0:
        retry_at = self.loop.time() + 1
        self.loop.add_timeout(retry_at, hello_sender, tries_left=remaining)
    elif remaining == 0:
        log.warning("Node %s did not respond to hello message" % node_id)
        self.ds.nodes[node_id].set_state(Node.STATE_FAILED)
def remove_partition(self, name, deliver):
    """Remove a named partition and flush the messages it was holding.

    Args:
        name: Key of the partition in ``self.partitions``.
        deliver: Passed to ``__process_partitioned_messages`` together
            with the partition being removed.

    Raises:
        ChistributedException: If no partition is registered under
            ``name``.
    """
    if name not in self.partitions:
        raise ChistributedException("No such partition: %s" % name)

    # Acquire lock to prevent messages being added to message queue
    # while we remove the partition
    self.msg_queue_lock.acquire()
    try:
        self.__process_partitioned_messages(self.partitions[name], deliver)
        del self.partitions[name]
    finally:
        # Release even if message processing raises; otherwise the
        # queue lock would stay held and block all later queue users.
        self.msg_queue_lock.release()

    log.warning("Removing partition {}".format(name))
def fail_node(self, node_id):
    """Move a running node into the partitioned state.

    Adds a partition, registered under the node's failed-node partition
    name, whose first side is just this node (``add_partition`` is
    called without a second side), then sets the node to
    ``Node.STATE_PARTITIONED``.

    Args:
        node_id: Key of the node in ``self.nodes``.

    Raises:
        ChistributedException: If ``node_id`` is not in ``self.nodes``,
            the node is already in ``Node.STATE_PARTITIONED``, or the
            node is not in ``Node.STATE_RUNNING``.
    """
    log.warning("failing {}".format(node_id))

    if node_id not in self.nodes:
        raise ChistributedException("No such node: {}".format(node_id))

    node = self.nodes[node_id]
    current = node.state
    if current == Node.STATE_PARTITIONED:
        raise ChistributedException(
            "Node {} is already in a failed state".format(node_id))
    if current != Node.STATE_RUNNING:
        raise ChistributedException(
            "Node {} cannot be failed because it is not running".format(
                node_id))

    partition_name = self.__failed_node_partition_name(node_id)
    self.add_partition(partition_name, [node_id])
    node.set_state(Node.STATE_PARTITIONED)
def stop_node(self, node_id):
    '''
    Ask the named node's process to shut down and forget about it.

    If ``poll()`` shows the process has already exited, its return code
    is collected with ``wait()`` and a warning is logged. Otherwise
    ``terminate()`` is called on it (SIGTERM, per the original note);
    node implementations are expected to catch that and shut down
    themselves, since killing processes outright is risky.

    In both cases the node's entry is removed from ``self.node_pids``.
    '''
    log.info("Stopping node " + node_id)

    proc = self.node_pids[node_id]
    if proc.poll() is None:
        # Still running: request a shutdown.
        proc.terminate()
    else:
        rc = proc.wait()
        log.warning("Node {} had already exited (rc = {})".format(node_id, rc))

    del self.node_pids[node_id]
def hello_sender(tries_left):
    """Send one hello message; reschedule, or classify the silent node.

    Does nothing once ``node_id`` is present in ``self.node_zid``.
    Otherwise the hello message is sent and one attempt is consumed.
    While attempts remain, this function is registered with the loop
    again for ``self.loop.time() + 1``. When the last attempt has just
    been used, a warning is logged and the node's process is polled:
    a non-``None`` return code marks the node ``Node.STATE_CRASHED`` and
    stops the loop, otherwise the node is marked ``Node.STATE_FAILED``.

    ``self``, ``node_id`` and ``zmq_msg`` are free variables supplied by
    the enclosing scope.
    """
    if node_id in self.node_zid:
        return

    self.__send_zmq_message(zmq_msg)
    remaining = tries_left - 1

    if remaining > 0:
        retry_at = self.loop.time() + 1
        self.loop.add_timeout(retry_at, hello_sender, tries_left=remaining)
        return
    if remaining != 0:
        return

    log.warning("Node %s did not respond to hello message" % node_id)

    # Check whether the node has died
    exit_code = self.node_pids[node_id].poll()
    if exit_code is None:
        self.ds.nodes[node_id].set_state(Node.STATE_FAILED)
    else:
        self.ds.nodes[node_id].set_state(Node.STATE_CRASHED)
        log.warning("Node %s has crashed (rc = %i)" % (node_id, exit_code))
        self.loop.stop()
def stop_node(self, node_id):
    '''
    Ask the named node's process to shut down and forget about it.

    If ``poll()`` shows the process has already exited, its return code
    is collected with ``wait()`` and a warning is logged. Otherwise
    ``terminate()`` is called on it (SIGTERM, per the original note);
    node implementations are expected to catch that and shut down
    themselves, since killing processes outright is risky.

    In both cases the node's entry is removed from ``self.node_pids``.
    '''
    log.info("Stopping node " + node_id)

    proc = self.node_pids[node_id]
    if proc.poll() is None:
        # Still running: request a shutdown.
        proc.terminate()
    else:
        rc = proc.wait()
        log.warning("Node {} had already exited (rc = {})".format(node_id, rc))

    del self.node_pids[node_id]
def add_partition(self, name, nodes1, nodes2=None):
    """Register a named partition between two groups of nodes.

    Args:
        name: Key under which the partition is stored in
            ``self.partitions``.
        nodes1: Identifiers (keys of ``self.nodes``) for one side.
        nodes2: Identifiers for the other side. When ``None``, every
            key of ``self.nodes`` that is not in ``nodes1`` is used.

    Raises:
        ChistributedException: If ``name`` is already registered, or any
            identifier in ``nodes1`` / ``nodes2`` is not in
            ``self.nodes``.
    """
    if name in self.partitions:
        raise ChistributedException(
            "A partition named '%s' already exists" % name)

    for node_id in nodes1:
        if node_id not in self.nodes:
            raise ChistributedException("No such node: %s" % node_id)

    if nodes2 is not None:
        for node_id in nodes2:
            if node_id not in self.nodes:
                raise ChistributedException("No such node: %s" % node_id)
    else:
        # Default second side: everything that is not on the first side.
        nodes2 = [node_id for node_id in self.nodes if node_id not in nodes1]

    first_side = [self.nodes[node_id] for node_id in nodes1]
    second_side = [self.nodes[node_id] for node_id in nodes2]
    self.partitions[name] = Partition(name, first_side, second_side)

    log.warning("Creating partition {}".format(name))
def __process_partitioned_messages(self, p, deliver):
    """Resolve queued messages that partition ``p`` separates.

    Every message in the queue at call time is examined once. A message
    whose source and destination are not reported as partitioned by
    ``p`` is put back on the queue. A message that is partitioned by
    ``p`` is sent through the backend when ``deliver`` is true and is
    otherwise left out of the queue (dropped).

    This method does not take the queue lock itself.
    """
    pending = len(self.msg_queue)
    for _ in range(pending):
        msg = self.msg_queue.pop()
        assert isinstance(msg, CustomMessage)

        src_node = self.nodes[msg.source]
        if msg.destination in self.nodes:
            dst_node = self.nodes[msg.destination]
        else:
            log.warning("Message with unknown destination {}".format(msg.destination))
            dst_node = None

        if not p.are_partitioned(src_node, dst_node):
            # Unaffected by this partition: keep it queued.
            self.msg_queue.appendleft(msg)
            continue

        if deliver:
            self.backend.send_message(msg.destination, msg)
def hello_sender(tries_left):
    """Send one hello message; reschedule, or classify the silent node.

    Does nothing once ``node_id`` is present in ``self.node_zid``.
    Otherwise the hello message is sent and one attempt is consumed.
    While attempts remain, this function is registered with the loop
    again for ``self.loop.time() + 1``. When the last attempt has just
    been used, a warning is logged and the node's process is polled:
    a non-``None`` return code marks the node ``Node.STATE_CRASHED`` and
    stops the loop, otherwise the node is marked ``Node.STATE_FAILED``.

    ``self``, ``node_id`` and ``zmq_msg`` are free variables supplied by
    the enclosing scope.
    """
    if node_id in self.node_zid:
        return

    self.__send_zmq_message(zmq_msg)
    remaining = tries_left - 1

    if remaining > 0:
        retry_at = self.loop.time() + 1
        self.loop.add_timeout(retry_at, hello_sender, tries_left=remaining)
        return
    if remaining != 0:
        return

    log.warning("Node %s did not respond to hello message" % node_id)

    # Check whether the node has died
    exit_code = self.node_pids[node_id].poll()
    if exit_code is None:
        self.ds.nodes[node_id].set_state(Node.STATE_FAILED)
    else:
        self.ds.nodes[node_id].set_state(Node.STATE_CRASHED)
        log.warning("Node %s has crashed (rc = %i)" % (node_id, exit_code))
        self.loop.stop()
def __process_partitioned_messages(self, p, deliver):
    """Resolve queued messages that partition ``p`` separates.

    Every message in the queue at call time is examined once. A message
    whose source and destination are not reported as partitioned by
    ``p`` is put back on the queue. A message that is partitioned by
    ``p`` is sent through the backend when ``deliver`` is true and is
    otherwise left out of the queue (dropped).

    This method does not take the queue lock itself.
    """
    pending = len(self.msg_queue)
    for _ in range(pending):
        msg = self.msg_queue.pop()
        assert isinstance(msg, CustomMessage)

        src_node = self.nodes[msg.source]
        if msg.destination in self.nodes:
            dst_node = self.nodes[msg.destination]
        else:
            log.warning("Message with unknown destination {}".format(
                msg.destination))
            dst_node = None

        if not p.are_partitioned(src_node, dst_node):
            # Unaffected by this partition: keep it queued.
            self.msg_queue.appendleft(msg)
            continue

        if deliver:
            self.backend.send_message(msg.destination, msg)