def __init__(self, simulation, **kwargs):
    """
    Set up a Raft replica: base replica state first, then the Raft
    bookkeeping, the read/write policies, the timers and finally the
    (initially empty) leader state.

    Recognized keyword arguments (all optional, each falls back to the
    module-level default of the same upper-case name): ``read_policy``,
    ``aggregate_writes``, ``election_timeout``, ``heartbeat_interval``.
    """
    # Base replica initialization comes first.
    super(RaftReplica, self).__init__(simulation, **kwargs)

    # Raft specific settings: every replica begins as a follower in term 0
    # with no vote cast, an empty log and an empty cache.
    self.state = State.FOLLOWER
    self.currentTerm = 0
    self.votedFor = None
    self.log = MultiObjectWriteLog()
    self.cache = {}

    # Policies
    policy_name = kwargs.get('read_policy', READ_POLICY)
    self.read_policy = ReadPolicy.get(policy_name)
    self.aggregate_writes = kwargs.get('aggregate_writes', AGGREGATE_WRITES)

    # Timers for work. Neither timer is started here.
    election_timeout = kwargs.get('election_timeout', ELECTION_TIMEOUT)
    heartbeat_interval = kwargs.get('heartbeat_interval', HEARTBEAT_INTERVAL)
    self.timeout = ElectionTimer.fromReplica(self, election_timeout)
    self.heartbeat = Timer(
        self.env, heartbeat_interval, self.on_heartbeat_timeout
    )

    # Leader state is unset until it is populated elsewhere.
    self.nextIndex = None
    self.matchIndex = None
def gossip(self):
    """
    Push the cached versions to one randomly chosen replica in every
    other location, then clear the cache and schedule the next round of
    anti-entropy.
    """
    for location in self.locations:
        # Never gossip inside our own location.
        if location == self.location:
            continue

        # Pick a random replica at the remote location.
        candidates = list(self.remotes(location))
        target = random.choice(candidates)

        # Record what is about to be sent.
        self.sim.logger.debug(
            "{} gossiping {} entries to {}".format(
                self, len(self.ae_cache), target
            )
        )

        # Wrap every cached version in a Write access and ship them all.
        entries = tuple(
            Write(version.name, self, version) for version in self.ae_cache
        )
        self.send(target, Gossip(entries, len(self.ae_cache), -1))

    # Everything cached so far has been gossiped; start fresh.
    self.ae_cache = []

    # Schedule the next anti-entropy round.
    self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
    self.ae_timer.start()
def get_anti_entropy_timeout(self):
    """
    Build a fixed-delay anti-entropy timer that calls ``gossip`` when it
    fires, store it on ``self.timeout`` and return the result of starting
    it. (A randomized delay could replace the fixed one in the future.)
    """
    timer = Timer(self.env, self.ae_delay, self.gossip)
    self.timeout = timer
    return timer.start()
def handle_session(self):
    """
    Starts a session timer if one isn't running, otherwise resets the
    currently running session timer on an additional access.

    The timer callback is ``on_session_timeout`` bound to the simulation
    time at which the session was opened.
    """
    if not self.session:
        self.session = Timer(
            self.env, self.session_timeout,
            partial(self.on_session_timeout, self.env.now)
        )
        # A Timer does nothing until it is started (every other timer in
        # this module is started explicitly). Without this call the
        # session could only begin counting on a *second* access, via
        # reset() below.
        self.session.start()
    else:
        # NOTE(review): relies on Timer.reset() returning the timer that
        # should be tracked from now on -- confirm against Timer.
        self.session = self.session.reset()
def run(self):
    """
    Generator that wakes up once per heartbeat interval. While the
    replica is an owner it arms a heartbeat timer (which triggers
    ``on_heartbeat_timeout``); in any other state it simply waits out the
    interval.
    """
    while True:
        if not self.state == State.OWNER:
            # Nothing to announce: quiesce for one interval.
            yield self.env.timeout(self.heartbeat_interval)
            continue

        # Owner: schedule the next heartbeat and wait on it.
        self.heartbeat = Timer(
            self.env, self.heartbeat_interval, self.on_heartbeat_timeout
        )
        yield self.heartbeat.start()
def gossip(self):
    """
    Push the cached versions to one randomly chosen replica in every
    other location, then clear the cache and schedule the next round of
    anti-entropy.
    """
    for location in self.locations:
        # Never gossip inside our own location.
        if location == self.location:
            continue

        # Pick a random replica at the remote location.
        candidates = list(self.remotes(location))
        target = random.choice(candidates)

        # Record what is about to be sent.
        self.sim.logger.debug(
            "{} gossiping {} entries to {}".format(
                self, len(self.ae_cache), target
            )
        )

        # Wrap every cached version in a Write access and ship them all.
        entries = tuple(
            Write(version.name, self, version) for version in self.ae_cache
        )
        self.send(target, Gossip(entries, len(self.ae_cache), -1))

    # Everything cached so far has been gossiped; start fresh.
    self.ae_cache = []

    # Schedule the next anti-entropy round.
    self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
    self.ae_timer.start()
def __init__(self, simulation, **kwargs):
    """
    Set up a Raft replica: base replica state first, then the Raft
    bookkeeping, the read/write policies, the timers and finally the
    (initially empty) leader state.

    Recognized keyword arguments (all optional, each falls back to the
    module-level default of the same upper-case name): ``read_policy``,
    ``aggregate_writes``, ``election_timeout``, ``heartbeat_interval``.
    """
    # Base replica initialization comes first.
    super(RaftReplica, self).__init__(simulation, **kwargs)

    # Raft specific settings: every replica begins as a follower in term 0
    # with no vote cast, an empty log and an empty cache.
    self.state = State.FOLLOWER
    self.currentTerm = 0
    self.votedFor = None
    self.log = MultiObjectWriteLog()
    self.cache = {}

    # Policies
    policy_name = kwargs.get("read_policy", READ_POLICY)
    self.read_policy = ReadPolicy.get(policy_name)
    self.aggregate_writes = kwargs.get("aggregate_writes", AGGREGATE_WRITES)

    # Timers for work. Neither timer is started here.
    election_timeout = kwargs.get("election_timeout", ELECTION_TIMEOUT)
    heartbeat_interval = kwargs.get("heartbeat_interval", HEARTBEAT_INTERVAL)
    self.timeout = ElectionTimer.fromReplica(self, election_timeout)
    self.heartbeat = Timer(
        self.env, heartbeat_interval, self.on_heartbeat_timeout
    )

    # Leader state is unset until it is populated elsewhere.
    self.nextIndex = None
    self.matchIndex = None
def handle_session(self):
    """
    Starts a session timer if one isn't running, otherwise resets the
    currently running session timer on an additional access.

    The timer callback is ``on_session_timeout`` bound to the simulation
    time at which the session was opened.
    """
    if not self.session:
        self.session = Timer(
            self.env, self.session_timeout,
            partial(self.on_session_timeout, self.env.now)
        )
        # A Timer does nothing until it is started (every other timer in
        # this module is started explicitly). Without this call the
        # session could only begin counting on a *second* access, via
        # reset() below.
        self.session.start()
    else:
        # NOTE(review): relies on Timer.reset() returning the timer that
        # should be tracked from now on -- confirm against Timer.
        self.session = self.session.reset()
def on_state_change(self):
    """
    Run the parent state-change handling, then manage the anti-entropy
    timer: it is cancelled for followers and candidates and (re)created
    and started for a leader. READY is accepted and ignored; any other
    state raises ``SimulationException``.
    """
    super(FloatedRaftReplica, self).on_state_change()

    if self.state in (State.FOLLOWER, State.CANDIDATE):
        # Only leaders gossip: cancel the anti-entropy timer if there is
        # one. The attribute may not exist yet when this runs.
        if getattr(self, 'ae_timer', None) is not None:
            self.ae_timer.stop()
            self.ae_timer = None
        return

    if self.state == State.LEADER:
        self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
        self.ae_timer.start()
        return

    if self.state == State.READY:
        # This happens on the call to super, just ignore for now.
        return

    raise SimulationException(
        "Unknown Floating Raft State: {!r} set on {}".format(
            self.state, self
        )
    )
def run(self):
    """
    Generator that wakes up once per heartbeat interval. While the
    replica is an owner it arms a heartbeat timer (which triggers
    ``on_heartbeat_timeout``); in any other state it simply waits out the
    interval.
    """
    while True:
        if not self.state == State.OWNER:
            # Nothing to announce: quiesce for one interval.
            yield self.env.timeout(self.heartbeat_interval)
            continue

        # Owner: schedule the next heartbeat and wait on it.
        self.heartbeat = Timer(
            self.env, self.heartbeat_interval, self.on_heartbeat_timeout
        )
        yield self.heartbeat.start()
def on_state_change(self):
    """
    Run the parent state-change handling, then manage the anti-entropy
    timer: it is cancelled for followers and candidates and (re)created
    and started for a leader. READY is accepted and ignored; any other
    state raises ``SimulationException``.
    """
    super(FloatedRaftReplica, self).on_state_change()

    if self.state in (State.FOLLOWER, State.CANDIDATE):
        # Only leaders gossip: cancel the anti-entropy timer if there is
        # one. The attribute may not exist yet when this runs.
        if getattr(self, 'ae_timer', None) is not None:
            self.ae_timer.stop()
            self.ae_timer = None
        return

    if self.state == State.LEADER:
        self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
        self.ae_timer.start()
        return

    if self.state == State.READY:
        # This happens on the call to super, just ignore for now.
        return

    raise SimulationException(
        "Unknown Floating Raft State: {!r} set on {}".format(
            self.state, self
        )
    )
class FloatedRaftReplica(RaftReplica):
    """
    Raft replica whose quorum is restricted to its own location; the
    leader "floats" committed versions to the other locations by
    gossiping them on an anti-entropy timer.
    """

    def __init__(self, simulation, **kwargs):
        # Initialize the underlying Raft replica first.
        super(FloatedRaftReplica, self).__init__(simulation, **kwargs)

        # Anti entropy settings: delay between gossip rounds, the timer
        # that drives them, and the versions waiting to be gossiped.
        self.ae_delay = kwargs.get('anti_entropy_delay', ANTI_ENTROPY_DELAY)
        self.ae_timer = None
        self.ae_cache = []

    @memoized
    def locations(self):
        """
        Returns the set of locations of all neighbors in this replica's
        consistency group.
        """
        return {node.location for node in self.neighbors(self.consistency)}

    def quorum(self):
        """
        Yields the neighbors that share this replica's location, followed
        by the replica itself; Raft consensus is run among these nodes.
        """
        for node in self.neighbors(self.consistency):
            if node.location == self.location:
                yield node

        # Don't forget to yield self!
        yield self

    def remotes(self, location=None):
        """
        Yields the neighbors located somewhere other than this replica's
        location (the targets for floated writes). When ``location`` is
        given, only neighbors at that location are yielded; asking for
        the replica's own location therefore yields nothing.
        """
        for node in self.neighbors(self.consistency):
            if node.location == self.location:
                continue
            if location is not None and node.location != location:
                continue
            yield node

    def gossip(self):
        """
        Push the cached versions to one randomly chosen replica in every
        other location, then clear the cache and schedule the next round
        of anti-entropy.
        """
        for location in self.locations:
            # Never gossip inside our own location.
            if location == self.location:
                continue

            # Pick a random replica at the remote location.
            candidates = list(self.remotes(location))
            target = random.choice(candidates)

            # Record what is about to be sent.
            self.sim.logger.debug(
                "{} gossiping {} entries to {}".format(
                    self, len(self.ae_cache), target
                )
            )

            # Wrap every cached version in a Write access and ship them.
            entries = tuple(
                Write(version.name, self, version)
                for version in self.ae_cache
            )
            self.send(target, Gossip(entries, len(self.ae_cache), -1))

        # Everything cached so far has been gossiped; start fresh.
        self.ae_cache = []

        # Schedule the next anti-entropy round.
        self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
        self.ae_timer.start()

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Run the parent state-change handling, then manage the
        anti-entropy timer: cancelled for followers and candidates,
        (re)created and started for a leader. READY is accepted and
        ignored; any other state raises ``SimulationException``.
        """
        super(FloatedRaftReplica, self).on_state_change()

        if self.state in (State.FOLLOWER, State.CANDIDATE):
            # Only leaders gossip: cancel the timer if there is one. The
            # attribute may not exist yet when this runs.
            if getattr(self, 'ae_timer', None) is not None:
                self.ae_timer.stop()
                self.ae_timer = None
            return

        if self.state == State.LEADER:
            self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
            self.ae_timer.start()
            return

        if self.state == State.READY:
            # This happens on the call to super, just ignore for now.
            return

        raise SimulationException(
            "Unknown Floating Raft State: {!r} set on {}".format(
                self.state, self
            )
        )

    def on_gossip_rpc(self, message):
        """
        Handles a gossip message: every entry (a Write access) has its
        version flagged as gossiped and is written locally, after which
        an empty, successful ``GossipResponse`` is sent to the source.
        """
        for access in message.value.entries:
            # Flag the version so it is not gossiped onward after commit.
            access.version.gossiped = True
            self.write(access)

        # Should we return with what's in our cache?
        # Respond to the sender
        self.send(message.source, GossipResponse([], 0, True, -1))

    def on_response_rpc(self, message):
        """
        Acknowledgment of a response; intentionally a no-op.
        """
        pass

    def on_ae_response_rpc(self, msg):
        """
        Handles an append entries response. As leader: update the
        follower's indices (or back off and retry), then try to advance
        the commit index, caching newly committed versions that did not
        arrive via gossip so they can be gossiped later. As candidate:
        step down when the response term is at least the current term.
        As follower: ignore. Any other state raises ``RaftRPCException``.
        """
        rpc = msg.value

        if self.state == State.LEADER:
            follower = msg.source

            if rpc.success:
                self.nextIndex[follower] = rpc.lastLogIndex + 1
                self.matchIndex[follower] = rpc.lastLogIndex
            else:
                # Decrement next index and retry append entries
                self.nextIndex[follower] -= 1
                self.send_append_entries(follower)

            # Walk back from the newest entry looking for the highest
            # index that a majority has matched in the current term.
            for n in xrange(self.log.lastApplied, self.log.commitIndex, -1):
                commit = Election(self.matchIndex.keys())
                for node, matched in self.matchIndex.items():
                    commit.vote(node, matched >= n)

                if not commit.has_passed():
                    continue
                if self.log[n][1] != self.currentTerm:
                    continue

                # Commit all versions from the last commit up to n.
                for idx in xrange(self.log.commitIndex, n+1):
                    version = self.log[idx][0]
                    if version is None:
                        continue

                    # Cache the version for anti-entropy unless it came
                    # to us through gossip in the first place.
                    if not getattr(version, 'gossiped', False):
                        self.ae_cache.append(version)

                    version.update(self, commit=True)

                # Set the commit index and stop searching.
                self.log.commitIndex = n
                break

        elif self.state == State.CANDIDATE:
            # Decide whether or not to step down.
            if rpc.term >= self.currentTerm:
                self.state = State.FOLLOWER
                self.sim.logger.info(
                    "{} has stepped down as candidate".format(self)
                )
                return

        elif self.state == State.FOLLOWER:
            # Ignore AE messages if we are the follower.
            return

        else:
            raise RaftRPCException(
                "Append entries response in unknown state: '{}'".format(
                    self.state
                )
            )
def read(self, name, **kwargs):
    """
    Perform a read at this tag replica.

    The read is turned into an access by the parent class and then
    handled according to who owns the object:

    - this replica owns it: complete the read with the latest version in
      the log (or drop it as empty if nothing has been written);
    - another replica owns it: drop the read (no remote reads yet);
    - nobody owns it: log the access, schedule a retry one heartbeat
      interval from now and, on the first attempt, try to acquire the tag.

    TODO: Remote vs Local Reads
    """
    access = super(TagReplica, self).read(name, **kwargs)

    # Count attempts only for accesses that originate here.
    if access.is_local_to(self):
        access.attempts += 1

    # Any access extends the session.
    self.handle_session()

    if self.owns(access.name):
        # TODO: Change to last commit!
        version = self.log[access.name].lastVersion

        # Nothing has been written yet, so there is nothing to read.
        if version is None:
            return access.drop(empty=True)

        access.update(version, completed=True)
        access.log(self)
        return access

    if self.find_owner(access.name) is not None:
        # Someone else owns the tag: right now just drop the read.
        self.sim.logger.info("ownership conflict: dropped {} at {}".format(
            access, self))
        return access.drop()

    # Nobody owns the tag, so we're going to acquire it.
    access.log(self)

    # We're going to have some read latency, retry the read.
    Timer(
        self.env, self.heartbeat_interval, lambda: self.read(access)
    ).start()

    # Only the first attempt requests ownership, and only if a tag
    # request is not already in flight.
    if access.attempts <= 1 and self.state != State.TAGGING:
        self.acquire(access.name)

    return access
class TagReplica(ConsensusReplica):
    """
    Replica that coordinates writes by owning "tags" (sets of object
    names). A replica requests a tag from its neighbors by vote, and while
    it is the owner it replicates per-object logs to them with
    Raft-like append entries messages.
    """

    def __init__(self, simulation, **kwargs):
        # These attributes are assigned *before* the parent initializer
        # runs; on_state_change reads them (e.g. self.heartbeat).
        # NOTE(review): presumably the parent sets self.state and thereby
        # triggers on_state_change -- confirm in ConsensusReplica.

        ## Timers for work
        self.session_timeout = kwargs.get('session_timeout', SESSION_TIMEOUT)
        self.heartbeat_interval = kwargs.get(
            'heartbeat_interval', HEARTBEAT_INTERVAL
        )
        self.session = None
        self.heartbeat = None

        ## Initialize the tag specific settings
        self.epoch = 0
        self.log = defaultdict(WriteLog)   # object name -> its write log
        self.view = defaultdict(set)       # owner -> set of owned names

        ## Owner state
        self.nextIndex = None
        self.matchIndex = None

        ## Initialize the replica
        super(TagReplica, self).__init__(simulation, **kwargs)
        self.state = State.READY

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def read(self, name, **kwargs):
        """
        When a tag replica performs a read it has to decide whether or not
        to read locally or to make a remote read across the cluster.

        Convert the read into an access, then check if we own the object.
        If we do, then return the latest version in the log.

        If we don't and no one else does either, attempt to acquire the
        tag and retry the read after one heartbeat interval. If someone
        else owns the tag the read is dropped.

        Current implementation: #2, MR, no remote access.

        TODO: Remote vs Local Reads
        """
        # Create the read event using super.
        access = super(TagReplica, self).read(name, **kwargs)

        # Record the number of attempts for the access
        if access.is_local_to(self):
            access.attempts += 1

        # Increase the session on access.
        self.handle_session()

        # Are we the owner of this tag?
        if self.owns(access.name):
            # TODO: Change to last commit!
            version = self.log[access.name].lastVersion

            # If the version is None, bail since we haven't read anything
            if version is None:
                return access.drop(empty=True)

            # Update the version, complete the read, and log the access
            access.update(version, completed=True)
            access.log(self)

            # Return, we're done reading!
            return access

        # Is there a different owner for the tag?
        owner = self.find_owner(access.name)
        if owner is not None:
            # Right now just drop the read on its face.
            self.sim.logger.info("ownership conflict: dropped {} at {}".format(
                access, self))
            return access.drop()

        # We're going to acquire the tag!
        # Log the access from this particular replica.
        access.log(self)

        # We're going to have some read latency, retry the read.
        Timer(
            self.env, self.heartbeat_interval, lambda: self.read(access)
        ).start()

        # Only the first attempt requests ownership, and only if a tag
        # request is not already in flight.
        if access.attempts <= 1 and self.state != State.TAGGING:
            self.acquire(access.name)

        return access

    def write(self, name, **kwargs):
        """
        When a replica performs a write it needs to decide if it can write
        to the tag locally, can acquire a tag for this object, or if it has
        to do something else like drop, wait, or remote write.

        If the access is local:

        - if the replica owns the tag, append and complete
        - if someone else owns the tag then drop, wait, or remote
        - if no one owns the tag, then attempt to acquire it

        If access is remote:

        - if we own the tag, then append but do not complete (at local)
        - if someone else owns the tag, log and forward to owner
        - if no one owns the tag then respond false

        ``name`` may be an object name or an existing access (this method
        re-invokes itself with the access on retry, and remote writes pass
        the access as well), so the object name is always taken from the
        access returned by the parent class.

        Raises ``AccessError`` for a remote write that carries no version.
        """
        # Create the write event using super.
        access = super(TagReplica, self).write(name, **kwargs)

        # Increase the session on access.
        self.handle_session()

        # Determine if the write is local or remote
        if access.is_local_to(self):
            # Record the number of attempts for the access
            access.attempts += 1

            # Fetch the latest version from the log.
            latest = self.log[access.name].lastVersion

            # Perform the write: first version of the object, or the
            # successor of the latest one.
            if latest is None:
                version = namespace(access.name)(self)
            else:
                version = latest.nextv(self)

            # Update the access with the latest version
            access.update(version)

        else:
            # If there is no version, raise an exception
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".
                    format(self))

            # Save the version variable for use below.
            version = access.version

        # Log the access at this replica
        access.log(self)

        # Are we the owner of this tag?
        if self.owns(access.name):
            # Perform the append entries. Keyed by access.name rather than
            # the raw argument, which is an access object on retries.
            self.log[access.name].append(version, self.epoch)

            # Update the version to track visibility latency
            version.update(self)

            # Complete the access if it was local
            if access.is_local_to(self):
                access.complete()

            # Now do AppendEntries
            # Also interrupt the heartbeat since we just sent AppendEntries
            if not settings.simulation.aggregate_writes:
                self.send_append_entries()
                if self.heartbeat:
                    self.heartbeat.stop()

            return access

        # Is there a different owner for the tag?
        owner = self.find_owner(access.name)
        if owner is not None:
            # Right now just drop the write on its face.
            self.sim.logger.info("ownership conflict: dropped {} at {}".format(
                access, self))
            return access.drop()

        # We're going to acquire the tag!
        # We're going to have some write latency, retry the write.
        Timer(
            self.env, self.heartbeat_interval, lambda: self.write(access)
        ).start()

        # Request the ownership of the tag
        self.acquire(access.name)
        return access

    def run(self):
        """
        We have to check in at every heartbeat interval. If we own a tag
        then send a heartbeat message, otherwise just keep quiescing.
        """
        while True:
            if self.state == State.OWNER:
                self.heartbeat = Timer(
                    self.env, self.heartbeat_interval,
                    self.on_heartbeat_timeout
                )
                yield self.heartbeat.start()
            else:
                yield self.env.timeout(self.heartbeat_interval)

    ######################################################################
    ## Helper Methods
    ######################################################################

    def owns(self, name):
        """
        Returns True if the name is in the current view for this replica.
        """
        return name in self.view[self]

    def find_owner(self, name):
        """
        Looks up the owner of the name in the current view. Returns None
        if there is no owner for the tag.
        """
        for owner, tag in self.view.items():
            if name in tag:
                return owner
        return None

    def acquire(self, tag):
        """
        Sends out the acquire tag RPC. ``tag`` is a single name or a
        set/frozenset of names; the request always includes the names this
        replica already holds.
        """
        # Construct the tag to send out
        if not isinstance(tag, (set, frozenset)):
            tag = frozenset([tag])

        # Make sure to request the tag we already have
        tag = frozenset(self.view[self] | tag)

        # Request tag with all current tags
        self.send_tag_request(tag)

        # Log the tag acquisition (send_tag_request stored it in self.tag)
        self.sim.logger.info("{} is atempting to acquire tag {}".format(
            self, self.tag))

    def release(self, tag=None):
        """
        Sends out the release tag RPC. With no argument every currently
        held name is released; otherwise ``tag`` is a single name or a
        set/frozenset of names to give up.
        """
        # Release all currently held tags
        if tag is None:
            tag = self.view[self]

        # Construct the tag to send out (if specified)
        if not isinstance(tag, (set, frozenset)):
            tag = frozenset([tag])

        # Request the difference of the tags we already have
        tag = frozenset(self.view[self] - tag)

        # Request tag with all current tags
        self.send_tag_request(tag)

        # Log the tag release
        self.sim.logger.info("{} is atempting to release tag {}".format(
            self, tag))

    def handle_session(self):
        """
        Starts a session timer if one isn't running, otherwise resets the
        currently running session timer on an additional access.
        """
        if not self.session:
            self.session = Timer(
                self.env, self.session_timeout,
                partial(self.on_session_timeout, self.env.now)
            )
            # A Timer does nothing until it is started (every other timer
            # in this class is started explicitly). Without this call the
            # session could only begin counting on a second access.
            self.session.start()
        else:
            # NOTE(review): relies on Timer.reset() returning the timer
            # that should be tracked from now on -- confirm against Timer.
            self.session = self.session.reset()

    def get_log_state(self, tag=None):
        """
        Constructs a mapping of object name to ``LogState`` (last applied
        index, last term, commit index), either for the names in ``tag``
        or, by default, for every name in the current view.
        """
        if tag is None:
            tag = [obj for view in self.view.values() for obj in view]

        return {
            obj: LogState(
                self.log[obj].lastApplied,
                self.log[obj].lastTerm,
                self.log[obj].commitIndex
            )
            for obj in tag
        }

    def send_tag_request(self, tag):
        """
        Broadcasts a tag request for the passed in tag.
        """
        # Change state to tagging and save tag locally
        self.state = State.TAGGING
        self.tag = tag

        # Request the entire tag in your current view.
        tagset = {
            owner.id: names for owner, names in self.view.items()
        }
        tagset[self.id] = self.tag

        # Send the tag request RPC to each neighbor
        rpc = RequestTag(self.epoch, tagset, self)
        for neighbor in self.neighbors():
            self.send(neighbor, rpc)

    def send_append_entries(self, target=None):
        """
        Helper function to send append entries to quorum or a specific
        node. Does nothing unless this replica is an owner.

        Note: fails silently if target is not in the neighbors list.
        """
        # ownership check
        if not self.state == State.OWNER:
            return

        # Go through follower list.
        for node, objs in self.nextIndex.items():
            # Filter based on the target supplied.
            if target is not None and node != target:
                continue

            # Construct the entries, or empty for heartbeat
            # The tag contains the state of each item to be sent
            entries = defaultdict(list)
            tag = defaultdict(LogState)

            for obj, nidx in objs.items():
                # A rule directly from the Raft paper
                if self.log[obj].lastApplied >= nidx:
                    entries[obj] = self.log[obj][nidx:]

                # Compute the previous log index and term
                prevLogIndex = nidx - 1
                prevLogTerm = self.log[obj][prevLogIndex].term
                commitIndex = self.log[obj].commitIndex

                # Create the tag state
                tag[obj] = LogState(prevLogIndex, prevLogTerm, commitIndex)

            # Send the append entries message
            self.send(node, AppendEntries(self.epoch, self.id, tag, entries))

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Setting the state decides how the Tag node will interact.

        READY clears election and owner state, TAGGING bumps the epoch and
        opens an election, OWNER builds the per-neighbor next/match
        indices. Any other state raises ``SimulationException``.
        """
        # Do state specific tag modifications
        if self.state == State.READY:
            self.votes = None
            self.tag = None

            # Remove owner state
            self.nextIndex = None
            self.matchIndex = None

            # Also interrupt the heartbeat
            if self.heartbeat:
                self.heartbeat.stop()

        elif self.state == State.TAGGING:
            # Convert to tag acquisition/release
            self.epoch += 1

            # Create election and vote for self
            self.votes = Election([node.id for node in self.quorum()])
            self.votes.vote(self.id)

            # Also interrupt the heartbeat
            if self.heartbeat:
                self.heartbeat.stop()

        elif self.state == State.OWNER:
            # Create the next index and match index
            self.nextIndex = {
                node: {
                    obj: self.log[obj].lastApplied + 1
                    for obj in self.view[self]
                }
                for node in self.neighbors()
            }

            self.matchIndex = {
                node: {obj: 0 for obj in self.view[self]}
                for node in self.neighbors()
            }

        else:
            # Report the offending state; the bare name `state` was never
            # defined here and raised NameError instead of this exception.
            raise SimulationException(
                "Unknown Tag Replica State: {!r} set on {}".format(
                    self.state, self))

    def on_heartbeat_timeout(self):
        """
        Time to send a heartbeat message to all tags.
        """
        if not self.state == State.OWNER:
            return

        # Send heartbeat or aggregated writes
        self.send_append_entries()

    def on_session_timeout(self, started):
        """
        If the session times out then go ahead and release the tag.
        ``started`` is the simulation time at which the session began.
        """
        duration = self.env.now - started

        self.sim.logger.info("session on {} terminated at {} ({} ms)".format(
            self.id, self.env.now, duration))

        self.sim.results.update('session length', (self.id, duration))

        self.session = None
        self.release()

    def on_request_tag_rpc(self, msg):
        """
        Respond to a request for a tag acquisition from a server.
        """
        rpc = msg.value
        accept = True

        # The requested epoch must be greater than or equal to the local
        # epoch; older requests are rejected.
        if rpc.epoch < self.epoch:
            accept = False

        # Ensure that no one else owns the tag in your current view.
        for candidate, tagset in rpc.tag.items():
            # Short circuit
            if not accept:
                break

            for tag in tagset:
                owner = self.find_owner(tag)
                if owner is not None and owner.id != candidate:
                    accept = False
                    break

        # Log the vote decision
        amsg = "accepted" if accept else "did not accept"
        lmsg = "{} {} tag [{}] for {}".format(
            self, amsg, ",".join(rpc.tag[rpc.candidate.id]), rpc.candidate.id)
        self.sim.logger.info(lmsg)

        # Send the vote response back to the tag requester
        return self.send(msg.source, TagResponse(self.epoch, accept))

    def on_tag_response_rpc(self, msg):
        """
        Handle the votes from tag requests to other nodes.

        Raises ``TagRPCException`` if the replica is in a state other than
        TAGGING, READY or OWNER.
        """
        rpc = msg.value

        if self.state == State.TAGGING:
            # If the epoch is greater than the current epoch
            if rpc.epoch > self.epoch:
                # Retry the tag request
                self.epoch = rpc.epoch
                self.send_tag_request(self.tag)

                self.sim.logger.info("{} retrying tag request for {}".format(
                    self, self.tag))

                # Exit: no more work required!
                return

            # Update the current election
            self.votes.vote(msg.source.id, rpc.accept)
            if self.votes.has_passed():
                # Update our local tag and become owner.
                # NOTE(review): if assigning self.state runs
                # on_state_change immediately, the OWNER indices are built
                # from the *previous* view, and a release to READY leaves
                # the old names in self.view[self] -- confirm and reorder
                # if so.
                if self.tag:
                    self.state = State.OWNER
                    self.view[self] = set(self.tag)
                else:
                    self.state = State.READY

                # Send out the ownership change append entries
                self.send_append_entries()

                # Log the new tag owner
                self.sim.logger.info("{} tag goes to: {}".format(
                    self, self.view[self]))

                # Record tag length over time
                self.sim.results.update(
                    'tag size', (self.id, self.env.now, len(self.view[self])))

        elif self.state in (State.READY, State.OWNER):
            # Ignore vote responses if we've changed our state
            return

        else:
            raise TagRPCException(
                "Tag request response in unknown state: '{}'".format(
                    self.state))

    def on_append_entries_rpc(self, msg):
        """
        Handle an append entries message from a tag owner: reject it if it
        comes from an older epoch, otherwise adopt the sender's view and
        epoch and apply the entries object by object, replying with
        per-object success flags and log states.

        Raises ``TagRPCException`` if entries arrive for an index that is
        already behind the local log (possible duplicate delivery).
        """
        rpc = msg.value

        # reply false if the epoch < current epoch
        if rpc.epoch < self.epoch:
            self.sim.logger.info(
                "{} doesn't accept append entries in epoch {} for epoch {}".
                format(self, self.epoch, rpc.epoch))

            # Send back the request that you made originally.
            return self.send(
                msg.source,
                AEResponse(
                    self.epoch,
                    {obj: False for obj in rpc.tag.keys()},
                    rpc.tag, Reason.EPOCH))

        # Update the view to match the view of the append entries
        # Update the epoch to match the rpc of the append entries
        self.view[msg.source] = set(rpc.tag.keys())
        if self.epoch < rpc.epoch:
            self.epoch = rpc.epoch

        # Now for each object in the RPC, perform Raft-like append entries.
        # The success tracking is a complete tracking for all objects, will
        # return false even if we need to update the log for only one thing.
        # We will reply back with a state object that has per-object details.
        success = defaultdict(bool)
        state = defaultdict(LogState)

        for obj, prev in rpc.tag.items():
            entries = rpc.entries[obj]
            objlog = self.log[obj]

            # If log doesn't contain an entry at prev index matching epoch.
            if (objlog.lastApplied < prev.index
                    or objlog[prev.index].term != prev.epoch):

                # Perform the logging of this state failure
                if objlog.lastApplied < prev.index:
                    self.sim.logger.info(
                        "{} doesn't accept append to {} index {} where last applied is {}"
                        .format(self, obj, prev.index, objlog.lastApplied))
                else:
                    self.sim.logger.info(
                        "{} doesn't accept append to {} due to epoch mismatch: {} vs {}"
                        .format(self, obj, prev.epoch, objlog[prev.index].term))

                # Mark that there is a problem and continue
                success[obj] = False
                state[obj] = LogState(
                    objlog.lastApplied, objlog.lastTerm, objlog.lastCommit)
                continue

            # At this point the entries are accepted because of continue statements
            if entries:
                if objlog.lastApplied >= prev.index:
                    # If existing entry conflicts with a new one (same index, different epochs)
                    # Delete the existing entry and all that follow it.
                    if objlog[prev.index].term != prev.epoch:
                        objlog.truncate(prev.index)

                if objlog.lastApplied > prev.index:
                    # Better look into what's happening here!
                    raise TagRPCException(
                        "{} is possibly receiving duplicate append entries".
                        format(self))

                # Append any new entries not already in the log.
                for entry in entries:
                    # Add the entry/epoch to the log
                    objlog.append(*entry)

                    # Update the versions to compute visibilities
                    entry[0].update(self)

                # Log the last write from the append entries
                self.sim.logger.debug(
                    "appending {} entries to {} log on {} (term {}, commit {})"
                    .format(len(entries), obj, self, objlog.lastTerm,
                            objlog.commitIndex))

            # Update the commit index and save the state of the object.
            if prev.commit > objlog.commitIndex:
                objlog.commitIndex = min(prev.commit, objlog.lastApplied)

            success[obj] = True
            state[obj] = LogState(
                objlog.lastApplied, objlog.lastTerm, objlog.lastCommit)

        # Return the response back to the owner
        reason = Reason.OK if all(success.values()) else Reason.LOG
        return self.send(msg.source,
                         AEResponse(self.epoch, success, state, reason))

    def on_ae_response_rpc(self, msg):
        """
        Handles acknowledgment of append entries messages.

        As owner: update the follower's per-object indices (or arrange a
        retry), then try to advance each object's commit index. While
        TAGGING: retry the tag request if the response carries a newer
        epoch. READY ignores the message; any other state raises
        ``TagRPCException``, as does an unknown failure reason.
        """
        rpc = msg.value
        retry = False

        if self.state == State.OWNER:

            # Update state of followers in the tag group
            for obj, success in rpc.success.items():
                if success:
                    self.nextIndex[msg.source][obj] = rpc.tag[obj].index + 1
                    self.matchIndex[msg.source][obj] = rpc.tag[obj].index

                else:
                    # If the epoch is not the same, update accordingly.
                    if rpc.epoch > self.epoch:
                        self.epoch = rpc.epoch

                    # If the failure was because of the epoch, simply retry.
                    if rpc.reason == Reason.EPOCH:
                        retry = True

                    # Otherwise decrement the next index and retry
                    elif rpc.reason == Reason.LOG:
                        self.nextIndex[msg.source][obj] -= 1
                        retry = True

                    else:
                        raise TagRPCException(
                            "Unknown append entries failure reason: {}".format(
                                rpc.reason))

            # Determine if we can commit the entry
            for obj, state in rpc.tag.items():
                log = self.log[obj]

                # Walk back from the newest entry to find the highest index
                # matched by a majority in the current epoch.
                for n in xrange(log.lastApplied, log.commitIndex, -1):
                    commit = Election(self.matchIndex.keys())
                    for node, objs in self.matchIndex.items():
                        match = objs[obj]
                        commit.vote(node, match >= n)

                    if commit.has_passed() and log[n].term == self.epoch:
                        # Commit all versions from the last log to now.
                        for idx in xrange(log.commitIndex, n + 1):
                            if not log[idx].version:
                                continue
                            log[idx].version.update(self, commit=True)

                        # Set the commit index and break
                        log.commitIndex = n
                        break

            # If retry, send append entries back to the source.
            if retry:
                self.send_append_entries(msg.source)

        elif self.state == State.TAGGING:
            # Determine if we need to retry the tagging again.
            if rpc.epoch > self.epoch:
                # Retry the tag request
                self.epoch = rpc.epoch
                self.send_tag_request(self.tag)

                self.sim.logger.info("{} retrying tag request for {}".format(
                    self, self.tag))

                return

        elif self.state == State.READY:
            # Ignore AE messages if we're not an owner anymore.
            return

        else:
            raise TagRPCException("Response in unknown state: '{}'".format(
                self.state))

    def on_remote_access(self, msg):
        """
        Handles remote writes to and from the replicas: refuses the access
        if this replica does not own the object, otherwise performs the
        read or write locally and acknowledges it.
        """
        access = msg.value.access

        # Ensure that we own the object
        if not self.owns(access.name):
            return self.send(msg.source,
                             AccessResponse(self.epoch, False, access))

        # If we do own the object, then respond:
        method = {
            'read': self.read,
            'write': self.write,
        }[access.type]

        # Call the remote method with the access.
        method(access)

        return self.send(msg.source, AccessResponse(self.epoch, True, access))

    def on_access_response_rpc(self, msg):
        """
        Handles responses to remote accesses: a successful response
        completes the access.
        """
        rpc = msg.value
        if rpc.success:
            rpc.access.complete()
class FloatedRaftReplica(RaftReplica):
    """
    Raft replica whose quorum is limited to the nodes at its own location.
    While it is the leader it caches newly committed versions and, on an
    anti-entropy timer, gossips them to one randomly chosen node at every
    other location.
    """

    def __init__(self, simulation, **kwargs):
        ## Initialize the replica
        super(FloatedRaftReplica, self).__init__(simulation, **kwargs)

        # Anti entropy settings
        self.ae_delay = kwargs.get('anti_entropy_delay', ANTI_ENTROPY_DELAY)
        self.ae_timer = None    # the currently scheduled gossip timer, if any
        self.ae_cache = []      # committed versions waiting to be gossiped

    @memoized
    def locations(self):
        """
        Returns all the locations in the network with Raft nodes.
        """
        return set(
            node.location for node in self.neighbors(self.consistency)
        )

    def quorum(self):
        """
        Yields the neighbors (same consistency group) that share this
        replica's location, followed by the replica itself. These are the
        only nodes Raft consensus is run with.
        """
        for node in self.neighbors(self.consistency):
            if node.location == self.location:
                yield node

        # Don't forget to yield self!
        yield self

    def remotes(self, location=None):
        """
        Yields the neighbors (same consistency group) that are NOT at this
        replica's location; these are the targets for floated writes.

        If ``location`` is given, only neighbors at that location are
        yielded. Passing the replica's own location yields nothing because
        local nodes are always excluded.
        """
        for node in self.neighbors(self.consistency):
            if node.location == self.location:
                continue
            if location is not None and node.location != location:
                continue
            yield node

    def gossip(self):
        """
        Sends the contents of the anti-entropy cache to one randomly
        selected node at each remote location, then empties the cache and
        schedules the next gossip round.
        """
        # Gossip to one node at each location
        for location in self.locations:
            # Don't gossip to nodes in self!
            if location == self.location:
                continue

            # Select a random target to gossip to
            target = random.choice(list(self.remotes(location)))

            # Log the gossip that's happening
            self.sim.logger.debug("{} gossiping {} entries to {}".format(
                self, len(self.ae_cache), target))

            # Fresh Write accesses are built for every target.
            entries = tuple([
                Write(version.name, self, version)
                for version in self.ae_cache
            ])

            # Send all the values in the cache.
            self.send(target, Gossip(entries, len(self.ae_cache), -1))

        # Empty the cache on gossip
        self.ae_cache = []

        # Reset the anti-entropy timer
        self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
        self.ae_timer.start()

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Does the same stuff as super, but also starts the anti-entropy
        timer when becoming leader and cancels it when becoming a follower
        or candidate.

        Raises SimulationException for any state other than FOLLOWER,
        CANDIDATE, LEADER or READY.
        """
        super(FloatedRaftReplica, self).on_state_change()

        if self.state in (State.FOLLOWER, State.CANDIDATE):
            # The hasattr guard tolerates a state change that happens
            # before __init__ has assigned ae_timer.
            if hasattr(self, 'ae_timer') and self.ae_timer is not None:
                # Cancel the anti-entropy timer.
                self.ae_timer.stop()
                self.ae_timer = None

        elif self.state == State.LEADER:
            self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
            self.ae_timer.start()

        elif self.state == State.READY:
            # This happens on the call to super, just ignore for now.
            pass

        else:
            raise SimulationException(
                "Unknown Floating Raft State: {!r} set on {}".format(
                    self.state, self))

    def on_gossip_rpc(self, message):
        """
        Handles the receipt of a gossip from another node. Every entry (a
        Write access) has its version flagged as gossiped and is then
        written through this replica. An empty, successful GossipResponse
        is always sent back to the source.
        """
        entries = message.value.entries

        # Go through the entries from the RPC and write to local cluster.
        for access in entries:
            # Flag the version so it is not cached for gossip again when
            # it commits here (see on_ae_response_rpc).
            access.version.gossiped = True
            self.write(access)

        # Should we return with what's in our cache?
        # Respond to the sender
        self.send(message.source, GossipResponse([], 0, True, -1))

    def on_response_rpc(self, message):
        """
        Just receives the acknowledgment of the response.
        """
        pass

    def on_ae_response_rpc(self, msg):
        """
        Handles the acknowledgment of an append entries message. The logic
        mirrors the parent handler (it does not delegate to it) and in
        addition caches every newly committed version that has not been
        gossiped so that it is floated on the next gossip round.

        Raises RaftRPCException if the replica is in a state other than
        LEADER, CANDIDATE or FOLLOWER.
        """
        rpc = msg.value

        if self.state == State.LEADER:

            if rpc.success:
                self.nextIndex[msg.source] = rpc.lastLogIndex + 1
                self.matchIndex[msg.source] = rpc.lastLogIndex

            else:
                # Decrement next index and retry append entries.
                # Floor the nextIndex at 1 (the start of the log), exactly
                # as the parent handler does: send_append_entries derives
                # prevLogIndex as nextIndex - 1 and indexes the log with
                # it, so an index below 1 would read self.log[-1].
                nidx = self.nextIndex[msg.source] - 1
                self.nextIndex[msg.source] = max(nidx, 1)
                self.send_append_entries(msg.source)

            # Decide if we can commit the entry
            for n in xrange(self.log.lastApplied, self.log.commitIndex, -1):
                commit = Election(self.matchIndex.keys())
                for k, v in self.matchIndex.iteritems():
                    commit.vote(k, v >= n)

                if commit.has_passed() and self.log[n][1] == self.currentTerm:
                    # Commit all versions from the last log entry to now.
                    for idx in xrange(self.log.commitIndex, n + 1):
                        version = self.log[idx][0]
                        if version is None:
                            continue

                        # Cache the version to anti-entropy!
                        if not getattr(version, 'gossiped', False):
                            self.ae_cache.append(version)

                        version.update(self, commit=True)

                    # Set the commit index and break
                    self.log.commitIndex = n
                    break

        elif self.state == State.CANDIDATE:

            # Decide whether or not to step down.
            if rpc.term >= self.currentTerm:
                ## Become a follower
                self.state = State.FOLLOWER

                ## Log the failed election
                self.sim.logger.info(
                    "{} has stepped down as candidate".format(self))
                return

        elif self.state == State.FOLLOWER:
            # Ignore AE messages if we are the follower.
            return

        else:
            raise RaftRPCException(
                "Append entries response in unknown state: '{}'".format(
                    self.state))
class EventualReplica(Replica):
    """
    Eventually consistent replica: reads and writes complete locally and
    writes are propagated to neighbors by periodic pairwise gossip
    (anti-entropy) and/or on-access rumor mongering.
    """

    def __init__(self, simulation, **kwargs):
        super(EventualReplica, self).__init__(simulation, **kwargs)

        # Eventually consistent settings
        self.ae_delay = kwargs.get("anti_entropy_delay", AE_DELAY)
        self.n_neighbors = kwargs.get("num_neighbors", NEIGHBORS)

        # Deprecated
        self.do_gossip = kwargs.get("do_gossip", DO_GOSSIP)
        self.do_rumoring = kwargs.get("do_rumoring", DO_RUMORING)

        self.log = MultiObjectWriteLog()  # the write log of the replica
        self.timeout = None               # anti entropy timer

    ######################################################################
    ## Properties
    ######################################################################

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def read(self, name, **kwargs):
        """
        Eventually consistent replicas simply return the latest version for
        the name that they have in their store. This easily could be stale
        or forked depending on writes elsewhere in the cluster.

        Returns the access; it is dropped (``empty=True``) when the log has
        no version for the name, otherwise it is completed immediately.
        """
        # Create the read event using super.
        access = super(EventualReplica, self).read(name, **kwargs)

        # Record the number of attempts for the access
        if access.is_local_to(self):
            access.attempts += 1

        # Fetch the latest version from the log
        version = self.log.get_latest_version(access.name)

        # If version is None then we haven't read anything; bail!
        if version is None:
            return access.drop(empty=True)

        # Eventual nodes read locally and immediately, so complete the read.
        access.update(version, completed=True)

        # Log the access from this particular replica.
        access.log(self)

        return access

    def write(self, name, **kwargs):
        """
        Performs a write to the object with the given name by first
        creating the access event using super. Note that other access
        events can be passed into the write method in the case of remote
        writes. The access will define if the write is local or not.

        If local: write to the latest local version and complete.
        If remote: append write to log if latest version of object else
        error.

        After local vs. remote do the following:

            1. append the write to the log as (version, id)
            2. update the version for visibility latency
            3. log the access on this replica
            4. call the rumor handler

        Raises AccessError if a remote write carries no version or if its
        version is not later than the latest version in the log.
        """
        # Create the write event using super.
        access = super(EventualReplica, self).write(name, **kwargs)

        # Determine if the write is local or remote
        if access.is_local_to(self):
            # Record the number of attempts for the access
            access.attempts += 1

            # Fetch the latest version from the log
            latest = self.log.get_latest_version(access.name)

            # Perform the write
            if latest is None:
                version = namespace(access.name)(self)
            else:
                version = latest.nextv(self)

            # Update the access with the latest version and complete
            access.update(version, completed=True)

        else:
            # If there is no version, raise an exception
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".format(self)
                )

            # Save the version variable for use below
            version = access.version
            current = self.log.get_latest_version(access.name)

            # Ensure that the version is the latest.
            if current is not None and version <= current:
                raise AccessError(
                    "Attempting unordered write of {} after write of {}".format(
                        version, current
                    )
                )

        # At this point we've dealt with local vs. remote

        # Append the latest version to the local data store
        self.log.append(version, 0)

        # Handle the access according to eventual rules
        version.update(self)   # Update the version to track visibility latency
        access.log(self)       # Log the access from this particular replica.
        self.rumor(access)     # Rumor the access on demand

        # Return the access for subclass access.
        return access

    def run(self):
        """
        The run method basically implements an anti-entropy timer: it
        repeatedly yields a freshly started anti-entropy timeout.
        """
        while True:
            yield self.get_anti_entropy_timeout()

    ######################################################################
    ## Helper Methods
    ######################################################################

    def gossip(self):
        """
        Pairwise gossip protocol by randomly selecting neighbors and
        sending each one the access of the latest version of every object
        in the log. Does nothing if gossiping is disabled.

        TODO: how to gossip to strong consistency nodes?
        """
        # If gossiping is not allowed, forget about it.
        if not self.do_gossip:
            return

        # Perform pairwise anti-entropy sessions with n_neighbors
        for target in self.get_anti_entropy_neighbors():
            # Send the latest version of ALL objects.
            entries = [
                self.log.get_latest_version(name).access
                for name in self.log.namespace
            ]
            self.send(target, Gossip(tuple(entries), len(entries)))

    def rumor(self, access):
        """
        Performs on-access rumor mongering: sends the access to the
        selected anti-entropy neighbors. Does nothing if rumoring is
        disabled.
        """
        # if rumoring is not allowed, forget about it.
        if not self.do_rumoring:
            return

        # Send the access to n other neighbors
        for target in self.get_anti_entropy_neighbors():
            self.send(target, Rumor(access))

    def get_anti_entropy_timeout(self):
        """
        Creates the anti-entropy timeout, stores it on the replica and
        returns the result of starting it. In the future this could be a
        random timeout rather than a fixed one.
        """
        self.timeout = Timer(self.env, self.ae_delay, self.gossip)
        return self.timeout.start()

    def select_anti_entropy_neighbor(self):
        """
        Implements the anti-entropy neighbor selection policy. By default
        this is simply uniform random selection of all the eventual
        neighbors.
        """
        return random.choice(self.neighbors(self.consistency))

    def get_anti_entropy_neighbors(self):
        """
        Yields ``n_neighbors`` independently selected neighbors to perform
        anti-entropy with (the same neighbor may be selected twice).
        """
        for _ in xrange(self.n_neighbors):
            yield self.select_anti_entropy_neighbor()

    def update_forte_children(self, current, remote):
        """
        This unfortunately named method is a recursive function that
        updates all the children of the remote version with the new forte
        number and returns the newly correct current version.

        The idea here is that if the current version has a lower forte
        number then we should update the children of the remote (higher
        forte) in order to make sure that the latest branch is current.

        This method provides backpressure from Raft to Eventual.

        Returns ``current`` unchanged unless federated versioning is on,
        the remote has a higher forte number, and a later descendant of
        the remote is found in the log; in that case the descendant is
        moved to the end of the log and returned.
        """

        def update_forte(forte, version, current):
            """
            Recursively update the forte number for a particular version.
            """
            # Update all the version's children with its forte number.
            for child in version.children:
                # Only update children that are in the current log.
                if child in self.log:
                    # Update child forte to parent and detect current
                    child.forte = forte
                    if child > current:
                        current = child

                    # Recurse on grandchildren
                    current = update_forte(forte, child, current)

            # Return the maximal version (using forte numbers) discovered.
            return current

        # This function only needs be called if we're in federated versioning.
        if settings.simulation.versioning != "federated":
            return current

        # If the current is greater than the remote, return it.
        if current is None or current >= remote:
            return current

        # Check the forte number on the remote and update the children.
        if remote.forte > current.forte:
            strong = update_forte(remote.forte, remote, current)
            if strong > current:
                # Put the strong version at the end of the log and return it
                # as the new current version (or latest for this object)
                if strong in self.log:
                    self.log.remove(strong)
                    self.log.append(strong, strong.forte)
                    return strong

                # This really shouldn't happen?!
                self.sim.logger.warning(
                    "Attempting to move {} to end when not in log!".format(strong)
                )

        # Last resort, return the current version.
        return current

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_gossip_rpc(self, message):
        """
        Handles the receipt of a gossip from another node. Expects multiple
        accesses (Write events) as entries. Entries later than the local
        version are written; for entries that are behind, the local latest
        access is collected and returned to the sender in the response.
        """
        entries = message.value.entries
        updates = []

        # Go through the entries from the RPC and update log
        for access in entries:
            # Get the latest version from the log then update with forte
            current = self.log.get_latest_version(access.name)
            current = self.update_forte_children(current, access.version)

            # If the access is greater than our current version, write it!
            if current is None or access.version > current:
                self.write(access)

            # Is the remote behind us? If so, send the latest version!
            elif access.version < current:
                updates.append(current.access)

            # Otherwise the versions are presumably equal, so do nothing.

        # Success here just means whether or not we're responding with updates
        success = bool(updates)

        # Respond to the sender with the latest versions from our log
        self.send(message.source, GossipResponse(updates, len(updates), success))

    def on_gossip_response_rpc(self, message):
        """
        Handles the response to pairwise gossiping by writing every entry
        from the responder that is new or later than the local version.
        """
        for access in message.value.entries:
            current = self.log.get_latest_version(access.name)
            current = self.update_forte_children(current, access.version)

            # This is a new version or a later version than our current.
            if current is None or access.version > current:
                self.write(access)

    def on_rumor_rpc(self, message):
        """
        Handles the rumor message from the originator of the rumor and
        always replies with a RumorResponse: a successful one if the rumor
        was written or already known, otherwise an unsuccessful one that
        carries the later local access.
        """
        access = message.value.access
        current = self.log.get_latest_version(access.name)
        current = self.update_forte_children(current, access.version)

        # Is the rumored version later than our current?
        if current is None or access.version > current:
            # Write the access which will rumor it out again
            self.write(access)

            # Respond True to the origin of the rumor
            response = RumorResponse(None, True)

        elif access.version < current:
            # Respond False to the origin with the later version
            response = RumorResponse(current.access, False)

        else:
            # Simply acknowledge receipt
            response = RumorResponse(None, True)

        # Send the response back to the source
        self.send(message.source, response)

    def on_rumor_response_rpc(self, message):
        """
        Handles the rumor acknowledgment. An unsuccessful response carries
        a later access from the responder, which is written locally if it
        is new or later than the local version.
        """
        response = message.value

        if not response.success:
            # This means that a later value has come in!
            remote = response.access
            current = self.log.get_latest_version(remote.name)
            current = self.update_forte_children(current, remote.version)

            # If their response is later than our version, write it.
            if current is None or remote.version > current:
                self.write(remote)
class RaftReplica(ConsensusReplica):
    """
    Replica that implements the Raft consensus protocol: leader election
    via RequestVote, log replication via AppendEntries, and forwarding of
    follower writes to the leader via RemoteWrite.
    """

    def __init__(self, simulation, **kwargs):
        ## Initialize the replica
        super(RaftReplica, self).__init__(simulation, **kwargs)

        ## Initialize Raft Specific settings
        self.state = State.FOLLOWER
        self.currentTerm = 0
        self.votedFor = None
        self.log = MultiObjectWriteLog()
        self.cache = {}  # name -> version written here but forwarded to the leader

        ## Policies
        self.read_policy = ReadPolicy.get(kwargs.get("read_policy", READ_POLICY))
        self.aggregate_writes = kwargs.get("aggregate_writes", AGGREGATE_WRITES)

        ## Timers for work
        eto = kwargs.get("election_timeout", ELECTION_TIMEOUT)
        hbt = kwargs.get("heartbeat_interval", HEARTBEAT_INTERVAL)

        self.timeout = ElectionTimer.fromReplica(self, eto)
        self.heartbeat = Timer(self.env, hbt, self.on_heartbeat_timeout)

        ## Leader state
        self.nextIndex = None
        self.matchIndex = None

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def recv(self, event):
        """
        Before dispatching the message to an RPC specific handler, there
        are some message-wide checks that need to occur. In this case the
        term must be inspected and if the replica is behind, it becomes a
        follower and adopts the newer term.
        """
        message = event.value
        rpc = message.value

        # If RPC request or response contains term > currentTerm
        # Set currentTerm to term and convert to follower.
        if rpc.term > self.currentTerm:
            self.state = State.FOLLOWER
            self.currentTerm = rpc.term

        # Record the received message and dispatch to event handler
        return super(RaftReplica, self).recv(event)

    def read(self, name, **kwargs):
        """
        Raft nodes perform a local read of the version selected by the
        replica's read policy (see read_via_policy). Because a committed
        version could be stale (a new version is still waiting to commit)
        a fork is possible.

        Returns the access; it is dropped (``empty=True``) when no version
        is available, otherwise it is completed immediately.
        """
        # Create the read event using super.
        access = super(RaftReplica, self).read(name, **kwargs)

        # Record the number of attempts for the access
        if access.is_local_to(self):
            access.attempts += 1

        # NOTE: Formerly, this was ALWAYS read commit not read latest, now
        # it is set by the read policy on the replica. We previously noted
        # that read committed was one of the key differences from eventual.
        version = self.read_via_policy(access.name)

        # If the version is None, then we haven't read anything!
        if version is None:
            return access.drop(empty=True)

        # Because this is a local read, complete the read.
        access.update(version, completed=True)

        # Log the access from this particular replica.
        access.log(self)

        return access

    def write(self, name, **kwargs):
        """
        The write can be initiated on any replica server, including
        followers. Step one is to create the access event using super,
        which gives us the ability to detect local vs. remote writes.

        If the write is local:
            - create a new version from the latest write.
            - if not leader: cache the version (so the replica can read
              its own write) and send a RemoteWrite to the leader.
            - if leader: append to log and complete.

        If the write is remote:
            - if not leader: log it, cache the version and forward to the
              leader.
            - if leader: append to log but do not complete (it completes
              at the originating replica).

        On the leader, the version is then updated for visibility latency
        and, unless writes are aggregated, AppendEntries is sent at once.

        Raises AccessError if a remote write carries no version.
        """
        access = super(RaftReplica, self).write(name, **kwargs)

        # Determine if the write is local or remote
        if access.is_local_to(self):
            # Record the number of attempts for the access
            access.attempts += 1

            # Write a new version to the latest read by policy
            version = self.write_via_policy(access.name)

            # Update the access with the latest version
            access.update(version)

            # Log the access from this particular replica.
            access.log(self)

            if self.state == State.LEADER:
                # Append to log and complete if leader and local
                self.append_via_policy(access, complete=True)

            else:
                # Store the version in the cache and send remote write.
                self.cache[access.name] = version
                return self.send_remote_write(access)

        else:
            # Log the access from this particular replica.
            access.log(self)

            # If there is no version, raise an exception
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".format(self)
                )

            # Save the version variable for use below.
            version = access.version

            if self.state == State.LEADER:
                # Append to log but do not complete since its remote
                self.append_via_policy(access, complete=False)

            else:
                # Remote write occurred from client to a follower
                self.sim.logger.info(
                    "remote write on follower node: {}".format(self)
                )

                # Store the version in the cache and send remote write.
                self.cache[access.name] = version
                return self.send_remote_write(access)

        # At this point we've dealt with local vs. remote, we should be
        # the leader (every other path returned above).
        assert self.state == State.LEADER

        # Update the version to track visibility latency
        forte = bool(settings.simulation.forte_on_append)
        version.update(self, forte=forte)

        # Now do AppendEntries
        # Also interrupt the heartbeat since we just sent AppendEntries
        if not self.aggregate_writes:
            self.send_append_entries()
            self.heartbeat.stop()

        return access

    def run(self):
        """
        Implements the Raft consensus protocol and elections: followers
        and candidates wait on the election timeout, the leader waits on
        the heartbeat timer.

        Raises SimulationException for any other state.
        """
        while True:
            if self.state in {State.FOLLOWER, State.CANDIDATE}:
                yield self.timeout.start()

            elif self.state == State.LEADER:
                yield self.heartbeat.start()

            else:
                raise SimulationException(
                    "Unknown Raft State: {!r} on {}".format(self.state, self)
                )

    ######################################################################
    ## Helper Methods
    ######################################################################

    def send_append_entries(self, target=None):
        """
        Helper function to send append entries to every follower in
        nextIndex, or only to ``target`` if one is given. Does nothing if
        this replica is not the leader.

        Note: fails silently if target is not in the neighbors list.
        """
        # Leader check
        if self.state != State.LEADER:
            return

        # Go through follower list.
        for node, nidx in self.nextIndex.iteritems():
            # Filter based on the target supplied.
            if target is not None and node != target:
                continue

            # Construct the entries, or empty for heartbeat
            entries = []
            if self.log.lastApplied >= nidx:
                entries = self.log[nidx:]

            # Compute the previous log index and term
            prevLogIndex = nidx - 1
            prevLogTerm = self.log[prevLogIndex].term

            # Send the heartbeat message
            self.send(
                node,
                AppendEntries(
                    self.currentTerm, self.id, prevLogIndex,
                    prevLogTerm, entries, self.log.commitIndex
                )
            )

    def send_remote_write(self, access):
        """
        Helper function to send a remote write from a follower to leader.
        If no leader can be found the access is dropped and the result of
        the drop is returned; otherwise the access is returned.
        """
        # Find the leader to perform the remote write.
        leader = self.get_leader_node()

        # If not leader, then drop the write
        if not leader:
            self.sim.logger.info("no leader: dropped write at {}".format(self))
            return access.drop()

        # Send the remote write to the leader
        self.send(leader, RemoteWrite(self.currentTerm, access))

        return access

    def get_leader_node(self):
        """
        Searches for the leader amongst the quorum and returns it, or None
        if there is no leader. Raises SimulationException if there are
        multiple leaders, which is an extreme edge case.
        """
        leaders = [node for node in self.quorum() if node.state == State.LEADER]

        if len(leaders) > 1:
            raise SimulationException("MutipleLeaders?!")

        if not leaders:
            return None

        return leaders[0]

    def read_via_policy(self, name):
        """
        This method returns a version from either the log or the cache
        according to the read policy set on the replica server as follows:

            - COMMIT: return the latest committed version (ignoring cache)
            - LATEST: return latest version in log or in cache

        Raises SimulationException on bad read policies.
        """
        # If the policy is read committed, return the latest committed version
        if self.read_policy == ReadPolicy.COMMIT:
            return self.log.get_latest_commit(name)

        # If the policy is latest, read the latest and compare to cache.
        if self.read_policy == ReadPolicy.LATEST:
            # Get the latest version from the log (committed or not)
            version = self.log.get_latest_version(name)

            # Prefer the cached version when it is later than the log's,
            # and also when the log has nothing for this name yet (the
            # first write to an object on a non-leader only lives in the
            # cache until it is replicated back).
            cached = self.cache.get(name)
            if cached is not None and (version is None or cached > version):
                return cached

            # Return the latest version
            return version

        # If we've reached this point, we don't know what to do!
        raise SimulationException("Unknown read policy!")

    def write_via_policy(self, name):
        """
        This method returns a new version incremented from the version
        that read_via_policy selects. It also handles any "new" writes,
        e.g. to objects that haven't been written yet.
        """
        # Fetch the version from the log or the cache according to the
        # read policy. This implements READ COMMITTED/READ LATEST
        latest = self.read_via_policy(name)

        # Perform the write
        if latest is None:
            return namespace(name)(self)

        return latest.nextv(self)

    def append_via_policy(self, access, complete=False):
        """
        This method is the gatekeeper for the log and can implement
        policies like "don't admit forks". It must drop the access if it
        doesn't meet the policy, and complete it if specified.

        Returns True once the access' version has been appended.

        NOTE: This is a leader-only method (followers have entries
        appended to their logs via AppendEntries) and raises
        RaftRPCException if the node is not the leader.
        """
        if self.state != State.LEADER:
            raise RaftRPCException(
                "Append via policies called on a follower replica!"
            )

        # The default policy is just append anything
        # NOTE: subclasses (as in Federated) can modify this
        self.log.append(access.version, self.currentTerm)

        # Complete the access if specified by the caller.
        if complete:
            access.complete()

        # Indicate that we've successfully appended to the log
        return True

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        When the state on a replica changes the internal state of the
        replica must also change, particularly the properties that define
        how the node interacts with RPC messages and client reads/writes.

        Raises SimulationException for any state other than FOLLOWER,
        CANDIDATE, LEADER or READY.
        """
        if self.state in (State.FOLLOWER, State.CANDIDATE):
            # Forget the vote and the leader-only bookkeeping.
            self.votedFor = None
            self.nextIndex = None
            self.matchIndex = None

        elif self.state == State.LEADER:
            self.nextIndex = {
                node: self.log.lastApplied + 1
                for node in self.quorum() if node != self
            }
            self.matchIndex = {
                node: 0 for node in self.quorum() if node != self
            }

        elif self.state == State.READY:
            # This happens on the call to super, just ignore for now.
            pass

        else:
            raise SimulationException(
                "Unknown Raft State: {!r} set on {}".format(self.state, self)
            )

    def on_heartbeat_timeout(self):
        """
        Callback for when a heartbeat timeout occurs, for AppendEntries
        RPC. Does nothing if this replica is not the leader.
        """
        if self.state != State.LEADER:
            return

        # Send heartbeat or aggregated writes
        self.send_append_entries()

    def on_election_timeout(self):
        """
        Callback for when an election timeout occurs, e.g. become
        candidate: increments the term, votes for itself and requests the
        vote of every other node in the quorum.
        """
        # Set state to candidate
        self.state = State.CANDIDATE

        # Create Election and vote for self
        self.currentTerm += 1
        self.votes = Election([node.id for node in self.quorum()])
        self.votes.vote(self.id)
        self.votedFor = self.id

        # Inform the rest of the quorum you'd like their vote.
        rpc = RequestVote(
            self.currentTerm, self.id, self.log.lastApplied, self.log.lastTerm
        )

        for follower in self.quorum():
            if follower == self:
                continue
            self.send(follower, rpc)

        # Log the newly formed candidacy
        self.sim.logger.info("{} is now a leader candidate".format(self))

    def on_request_vote_rpc(self, msg):
        """
        Callback for RequestVote RPC call. Grants the vote only if the
        candidate's term is not behind ours, we have not voted for someone
        else, and the candidate's log is at least as up to date as ours;
        otherwise replies with a negative VoteResponse.
        """
        rpc = msg.value

        if rpc.term >= self.currentTerm:
            if self.votedFor is None or self.votedFor == rpc.candidateId:
                if self.log.as_up_to_date(rpc.lastLogTerm, rpc.lastLogIndex):

                    self.sim.logger.info(
                        "{} voting for {}".format(self, rpc.candidateId)
                    )

                    self.timeout.stop()
                    self.votedFor = rpc.candidateId
                    return self.send(
                        msg.source, VoteResponse(self.currentTerm, True)
                    )

        return self.send(msg.source, VoteResponse(self.currentTerm, False))

    def on_vote_response_rpc(self, msg):
        """
        Callback for the RequestVote RPC response. A candidate records the
        vote and becomes leader once the election has passed; followers
        and leaders ignore the message.

        Raises RaftRPCException if the replica is in any other state.
        """
        rpc = msg.value

        if self.state == State.CANDIDATE:

            # Update the current election
            self.votes.vote(msg.source.id, rpc.voteGranted)
            if self.votes.has_passed():
                ## Become the leader
                self.state = State.LEADER
                self.timeout.stop()

                ## Send the leadership change append entries
                self.send_append_entries()

                ## Log the new leader
                self.sim.logger.info("{} has become raft leader".format(self))

            return

        elif self.state in (State.FOLLOWER, State.LEADER):
            # Ignore vote responses if we've already been elected.
            return

        else:
            raise RaftRPCException(
                "Vote response in unknown state: '{}'".format(self.state)
            )

    def on_append_entries_rpc(self, msg):
        """
        Callback for the AppendEntries RPC call. Stops the election
        timeout, rejects the request if the term is stale or the log does
        not match at prevLogIndex, otherwise appends the new entries,
        advances the commit index and replies with a successful
        AEResponse.
        """
        rpc = msg.value

        # Stop the election timeout
        self.timeout.stop()

        # Reply false if term < current term
        if rpc.term < self.currentTerm:
            self.sim.logger.info(
                "{} doesn't accept write on term {}".format(self, self.currentTerm)
            )
            return self.send(
                msg.source,
                AEResponse(
                    self.currentTerm, False,
                    self.log.lastApplied, self.log.lastCommit
                )
            )

        # Reply false if log doesn't contain an entry at prevLogIndex whose
        # term matches previous log term.
        if self.log.lastApplied < rpc.prevLogIndex or self.log[rpc.prevLogIndex][1] != rpc.prevLogTerm:
            if self.log.lastApplied < rpc.prevLogIndex:
                self.sim.logger.info(
                    "{} doesn't accept write on index {} where last applied is {}".format(
                        self, rpc.prevLogIndex, self.log.lastApplied
                    )
                )
            else:
                self.sim.logger.info(
                    "{} doesn't accept write for term mismatch {} vs {}".format(
                        self, rpc.prevLogTerm, self.log[rpc.prevLogIndex][1]
                    )
                )

            return self.send(
                msg.source,
                AEResponse(
                    self.currentTerm, False,
                    self.log.lastApplied, self.log.lastCommit
                )
            )

        # At this point AppendEntries RPC is accepted
        if rpc.entries:
            if self.log.lastApplied >= rpc.prevLogIndex:
                # If existing entry conflicts with new one (same index,
                # different terms) delete the existing entry and all that
                # follow it.
                if self.log[rpc.prevLogIndex][1] != rpc.prevLogTerm:
                    self.log.truncate(rpc.prevLogIndex)

            if self.log.lastApplied > rpc.prevLogIndex:
                # Otherwise this could be a message that is sent again;
                # acknowledge it without appending anything.
                self.sim.logger.warning(
                    "{} is possibly receiving a duplicate append entries!".format(self)
                )
                return self.send(
                    msg.source,
                    AEResponse(
                        self.currentTerm, True,
                        self.log.lastApplied, self.log.lastCommit
                    )
                )

            # Append any new entries not already in the log.
            for entry in rpc.entries:
                # Add the entry/term to the log
                self.log.append(*entry)
                self.sim.logger.debug(
                    "appending {} to {} on {}".format(entry[0], entry[1], self)
                )

                # Update the versions to compute visibilities
                entry[0].update(self)

            # Log the last write from the append entries.
            self.sim.logger.debug(
                "{} writes {} at idx {} (term {}, commit {})".format(
                    self, self.log.lastVersion, self.log.lastApplied,
                    self.log.lastTerm, self.log.commitIndex
                )
            )

        # If leaderCommit > commitIndex, update commit Index
        if rpc.leaderCommit > self.log.commitIndex:
            self.log.commitIndex = min(rpc.leaderCommit, self.log.lastApplied)

        # Return success response.
        return self.send(
            msg.source,
            AEResponse(
                self.currentTerm, True,
                self.log.lastApplied, self.log.lastCommit
            )
        )

    def on_ae_response_rpc(self, msg):
        """
        Handles acknowledgment of append entries message. The leader
        updates the follower's indices (or backs off and retries), then
        commits the highest entry of the current term that a majority has
        matched. A candidate steps down if the response term is not behind
        its own; a follower ignores the message.

        Raises RaftRPCException if the replica is in any other state.
        """
        rpc = msg.value

        if self.state == State.LEADER:

            if rpc.success:
                self.nextIndex[msg.source] = rpc.lastLogIndex + 1
                self.matchIndex[msg.source] = rpc.lastLogIndex

            else:
                # Decrement next index and retry append entries
                # Ensure to floor the nextIndex to 1 (the start of the log).
                nidx = self.nextIndex[msg.source] - 1
                self.nextIndex[msg.source] = max(nidx, 1)
                self.send_append_entries(msg.source)

            # Decide if we can commit the entry
            for n in xrange(self.log.lastApplied, self.log.commitIndex, -1):
                commit = Election(self.matchIndex.keys())
                for k, v in self.matchIndex.iteritems():
                    commit.vote(k, v >= n)

                if commit.has_passed() and self.log[n][1] == self.currentTerm:
                    # Commit all versions from the last log entry to now.
                    forte = bool(settings.simulation.forte_on_commit)
                    for idx in xrange(self.log.commitIndex, n + 1):
                        if self.log[idx][0] is None:
                            continue
                        self.log[idx][0].update(self, commit=True, forte=forte)

                    # Set the commit index and break
                    self.log.commitIndex = n
                    break

        elif self.state == State.CANDIDATE:

            # Decide whether or not to step down.
            if rpc.term >= self.currentTerm:
                ## Become a follower
                self.state = State.FOLLOWER

                ## Log the failed election
                self.sim.logger.info(
                    "{} has stepped down as candidate".format(self)
                )
                return

        elif self.state == State.FOLLOWER:
            # Ignore AE messages if we are the follower.
            return

        else:
            raise RaftRPCException(
                "Append entries response in unknown state: '{}'".format(self.state)
            )

    def on_remote_write_rpc(self, message):
        """
        Unpacks the access from the remote write, initiates a local write
        with it and replies with a WriteResponse whose success flag is
        False if the access ended up dropped.
        """
        # Write the access from the remote replica (the RemoteWrite field
        # is named `version` but send_remote_write stores the access in it).
        access = message.value.version
        self.write(access)

        # Check if the access was dropped (e.g. the write failed)
        success = not access.is_dropped()

        # Send the write response
        self.send(
            message.source, WriteResponse(self.currentTerm, success, access)
        )

    def on_write_response_rpc(self, message):
        """
        Completes the write if the remote write was successful.
        """
        rpc = message.value
        if rpc.success:
            rpc.access.complete()
class EventualReplica(Replica):
    """
    A replica that reads and writes against its local log and exchanges
    versions with neighbors through periodic gossip and on-access rumors.
    """

    def __init__(self, simulation, **kwargs):
        """
        Initialize the replica, then read the anti-entropy settings from
        the keyword arguments (falling back on the module defaults).
        """
        super(EventualReplica, self).__init__(simulation, **kwargs)

        # Eventually consistent settings
        self.ae_delay = kwargs.get('anti_entropy_delay', AE_DELAY)
        self.n_neighbors = kwargs.get('num_neighbors', NEIGHBORS)

        # Deprecated
        self.do_gossip = kwargs.get('do_gossip', DO_GOSSIP)
        self.do_rumoring = kwargs.get('do_rumoring', DO_RUMORING)

        self.log = MultiObjectWriteLog()  # the write log of the replica
        self.timeout = None               # anti entropy timer

    ######################################################################
    ## Properties
    ######################################################################

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def read(self, name, **kwargs):
        """
        Eventually consistent replicas simply return the latest version for
        the name that they have in their store. This easily could be stale
        or forked depending on writes elsewhere in the cluster.

        Returns the access; it is dropped (empty) when the log holds no
        version for the name.
        """
        # Create the read event using super.
        access = super(EventualReplica, self).read(name, **kwargs)

        # Record the number of attempts for the access
        if access.is_local_to(self):
            access.attempts += 1

        # Fetch the latest version from the log
        version = self.log.get_latest_version(access.name)

        # If version is None then we haven't read anything; bail!
        if version is None:
            return access.drop(empty=True)

        # Eventual nodes read locally and immediately, so complete the read.
        access.update(version, completed=True)

        # Log the access from this particular replica.
        access.log(self)

        return access

    def write(self, name, **kwargs):
        """
        Performs a write to the object with the given name by first
        creating the access event using super. Note that other access
        events can be passed into the write method in the case of remote
        writes. The access will define if the write is local or not.

        If local: write to the latest local version and complete.
        If remote: append write to log if latest version of object else
        error.

        After local vs. remote do the following:

            1. append the write to the log as (version, id)
            2. cache the latest access for gossip or rumoring
            3. update the version for visibility latency
            4. call the rumor handler

        Raises AccessError for a remote write that has no version or whose
        version is not later than the latest version in the log.
        """
        # Create the write event using super.
        access = super(EventualReplica, self).write(name, **kwargs)

        # Determine if the write is local or remote
        if access.is_local_to(self):
            # Record the number of attempts for the access
            access.attempts += 1

            # Fetch the latest version from the log
            latest = self.log.get_latest_version(access.name)

            # Perform the write
            if latest is None:
                version = namespace(access.name)(self)
            else:
                version = latest.nextv(self)

            # Update the access with the latest version and complete
            access.update(version, completed=True)

        else:
            # If there is no version, raise an exception
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".format(self)
                )

            # Save the version variable for use below
            version = access.version
            current = self.log.get_latest_version(access.name)

            # Ensure that the version is the latest.
            if current is not None and version <= current:
                raise AccessError(
                    "Attempting unordered write of {} after write of {}".format(version, current)
                )

        # At this point we've dealt with local vs. remote
        # Append the latest version to the local data store
        self.log.append(version, 0)

        # Handle the access according to eventual rules
        version.update(self)  # Update the version to track visibility latency
        access.log(self)      # Log the access from this particular replica.
        self.rumor(access)    # Rumor the access on demand

        # Return the access for subclass access.
        return access

    def run(self):
        """
        The run method basically implements an anti-entropy timer: it
        repeatedly yields a freshly started anti-entropy timeout.
        """
        while True:
            yield self.get_anti_entropy_timeout()

    ######################################################################
    ## Helper Methods
    ######################################################################

    def gossip(self):
        """
        Pairwise gossip protocol by randomly selecting a neighbor and
        exchanging information about the state of the latest objects in
        the cache since the last anti-entropy delay.

        TODO: how to gossip to strong consistency nodes?
        """
        # If gossiping is not allowed, forget about it.
        if not self.do_gossip:
            return

        # Perform pairwise anti-entropy sessions with n_neighbors
        for target in self.get_anti_entropy_neighbors():
            # Send the latest version of ALL objects.
            entries = [
                self.log.get_latest_version(name).access
                for name in self.log.namespace
            ]
            gossip = Gossip(tuple(entries), len(entries))
            self.send(target, gossip)

    def rumor(self, access):
        """
        Performs on access rumor mongering: sends the access as a Rumor to
        each selected anti-entropy neighbor when rumoring is enabled.
        """
        # if rumoring is not allowed, forget about it.
        if not self.do_rumoring:
            return

        # Send the access to n other neighbors (excluding the origin)
        for target in self.get_anti_entropy_neighbors():
            rumor = Rumor(access)
            self.send(target, rumor)

    def get_anti_entropy_timeout(self):
        """
        Creates the anti-entropy timeout and returns the result of
        starting it. In the future this could be random timeout not fixed.
        """
        self.timeout = Timer(self.env, self.ae_delay, self.gossip)
        return self.timeout.start()

    def select_anti_entropy_neighbor(self):
        """
        Implements the anti-entropy neighbor selection policy. By default
        this is simply uniform random selection of all the eventual
        neighbors.
        """
        return random.choice(self.neighbors(self.consistency))

    def get_anti_entropy_neighbors(self):
        """
        Selects the neighbors to perform anti-entropy with: yields
        n_neighbors independent selections (the same neighbor may repeat).
        """
        for _ in xrange(self.n_neighbors):
            yield self.select_anti_entropy_neighbor()

    def update_forte_children(self, current, remote):
        """
        This unfortunately named method is a recursive function that
        updates all the children of the remote version with the new forte
        number and returns the newly correct current version.

        The idea here is that if the current version has a lower forte
        number then we should update the children of the remote (higher
        forte) in order to make sure that the latest branch is current.

        This method provides backpressure from Raft to Eventual.
        """

        def update_forte(forte, version, current):
            """
            Recursive update the forte number for a particular version.
            """
            # Update all the version's children with its forte number.
            for child in version.children:
                # Only update children that are in the current log.
                if child in self.log:
                    # Update child forte to parent and detect current
                    child.forte = forte
                    if child > current:
                        current = child

                    # Recurse on grandchildren
                    current = update_forte(forte, child, current)

            # Return the maximal version (using forte numbers) discovered.
            return current

        # This function only needs be called if we're in federated versioning.
        if settings.simulation.versioning != "federated":
            return current

        # If the current is greater than the remote, return it.
        if current is None or current >= remote:
            return current

        # Check the forte number on the remote and update the children.
        if remote.forte > current.forte:
            strong = update_forte(remote.forte, remote, current)
            if strong > current:
                # Put the strong version at the end of the log and return it
                # as the new current version (or latest for this object)
                if strong in self.log:
                    self.log.remove(strong)
                    self.log.append(strong, strong.forte)
                    return strong
                else:
                    # This really shouldn't happen?!
                    # The placeholder is filled in so the warning names
                    # the offending version.
                    self.sim.logger.warning(
                        "Attempting to move {} to end when not in log!".format(strong)
                    )

        # Last resort, return the current version.
        return current

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_gossip_rpc(self, message):
        """
        Handles the receipt of a gossip from another node. Expects multiple
        accesses (Write events) as entries. Writes every entry that is
        later than the local version and replies with the local accesses
        that are later than what the sender offered.
        """
        entries = message.value.entries
        updates = []

        # Go through the entries from the RPC and update log
        for access in entries:
            # Get the latest version from the log then update with forte
            current = self.log.get_latest_version(access.name)
            current = self.update_forte_children(current, access.version)

            # If the access is greater than our current version, write it!
            if current is None or access.version > current:
                self.write(access)

            # Is the the remote behind us? If so, send the latest version!
            elif access.version < current:
                updates.append(current.access)

            # Otherwise the versions are presumably equal: nothing to do.

        # Success here just means whether or not we're responding with updates
        success = bool(updates)

        # Respond to the sender with the latest versions from our log
        self.send(message.source, GossipResponse(updates, len(updates), success))

    def on_gossip_response_rpc(self, message):
        """
        Handles the response to pairwise gossiping, writing every entry
        from the responder that is new or later than the local version.
        """
        entries = message.value.entries

        for access in entries:
            current = self.log.get_latest_version(access.name)
            current = self.update_forte_children(current, access.version)

            # This is a new version or a later version than our current.
            if current is None or access.version > current:
                self.write(access)

    def on_rumor_rpc(self, message):
        """
        Handles the rumor message from the originator of the rumor and
        always sends a RumorResponse back to the source.
        """
        access = message.value.access
        current = self.log.get_latest_version(access.name)
        current = self.update_forte_children(current, access.version)

        # Is the rumored version later than our current?
        if current is None or access.version > current:
            # Write the access which will rumor it out again
            self.write(access)

            # Respond True to the origin of the rumor
            response = RumorResponse(None, True)

        elif access.version < current:
            # Respond False to the origin with the later version
            response = RumorResponse(current.access, False)

        else:
            # Simply acknowledge receipt
            response = RumorResponse(None, True)

        # Send the response back to the source
        self.send(message.source, response)

    def on_rumor_response_rpc(self, message):
        """
        Handles the rumor acknowledgment. An unsuccessful response carries
        a later access from the responder, which is written locally when
        it is newer than the current version.
        """
        response = message.value

        if not response.success:
            # This means that a later value has come in!
            access = response.access
            current = self.log.get_latest_version(access.name)
            current = self.update_forte_children(current, access.version)

            # If their response is later than our version, write it.
            if current is None or access.version > current:
                self.write(access)
def write(self, name, **kwargs):
    """
    When a replica performs a write it needs to decide if it can write to
    the tag locally, can acquire a tag for this object, or if it has to do
    something else like drop, wait, or remote write.

    If the access is local:

        - if the replica owns the tag, append and complete
        - if someone else owns the tag then drop, wait, or remote
        - if no one owns the tag, then attempt to acquire it

    If access is remote:

        - if we own the tag, then append but do not complete (at local)
        - if someone else owns the tag, log and forward to owner
        - if no one owns the tag then respond false

    ``name`` may be an object name or an existing access (the retry timer
    below passes the access back in), so all lookups use ``access.name``.

    Returns the access. Raises AccessError for a remote write that
    carries no version.
    """
    # Create the write event using super.
    access = super(TagReplica, self).write(name, **kwargs)

    # Increase the session on access.
    self.handle_session()

    # Determine if the write is local or remote
    if access.is_local_to(self):
        # Record the number of attempts for the access
        access.attempts += 1

        # Fetch the latest version from the log.
        latest = self.log[access.name].lastVersion

        # Perform the write
        if latest is None:
            version = namespace(access.name)(self)
        else:
            version = latest.nextv(self)

        # Update the access with the latest version
        access.update(version)

    else:
        # If there is no version, raise an exception
        if access.version is None:
            raise AccessError(
                "Attempting a remote write on {} without a version!".format(self)
            )

        # Save the version variable for use below.
        version = access.version

    # Log the access at this replica
    access.log(self)

    # Are we the owner of this tag?
    if self.owns(access.name):
        # Perform the append entries. Key by access.name: `name` can be an
        # access object, which would silently create a new log entry.
        self.log[access.name].append(version, self.epoch)

        # Update the version to track visibility latency
        version.update(self)

        # Complete the access if it was local
        if access.is_local_to(self):
            access.complete()

        # Now do AppendEntries
        # Also interrupt the heartbeat since we just sent AppendEntries
        if not settings.simulation.aggregate_writes:
            self.send_append_entries()
            if self.heartbeat:
                self.heartbeat.stop()

        return access

    # Is there a different owner for the tag?
    owner = self.find_owner(access.name)
    if owner is not None:
        # Right now just drop the write on its face.
        self.sim.logger.info(
            "ownership conflict: dropped {} at {}".format(access, self)
        )
        return access.drop()

    # We're going to acquire the tag!
    # We're going to have some write latency, retry the write.
    Timer(
        self.env, self.heartbeat_interval, lambda: self.write(access)
    ).start()

    # Request the ownership of the tag
    self.acquire(access.name)

    return access
class TagReplica(ConsensusReplica):
    """
    A consensus replica that coordinates writes per "tag" (a set of object
    names). A replica requests ownership of a tag from its peers and, once
    owner, replicates its per-object logs with Raft-like append entries.
    """

    def __init__(self, simulation, **kwargs):
        """
        Set up the tag-specific state before delegating to the base class
        initializer, then move the replica into the READY state.
        """
        ## Timers for work
        self.session_timeout = kwargs.get('session_timeout', SESSION_TIMEOUT)
        self.heartbeat_interval = kwargs.get('heartbeat_interval', HEARTBEAT_INTERVAL)
        self.session = None
        self.heartbeat = None

        ## Initialze the tag specific settings
        self.epoch = 0
        self.log = defaultdict(WriteLog)   # one write log per object name
        self.view = defaultdict(set)       # owner -> set of owned names

        ## Owner state
        self.nextIndex = None
        self.matchIndex = None

        ## Initialize the replica
        super(TagReplica, self).__init__(simulation, **kwargs)
        self.state = State.READY

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def read(self, name, **kwargs):
        """
        When a tag replica performs a read it has to decide whether or not
        to read locally or to make a remote read across the cluster.

        Convert the read into an access, then check if we own the object.
        If we do, then return the latest commit.

        If we don't and no one else does either, attempt to acquire the
        tag. If we don't and someone else does then either drop, wait, or
        remote.

        Current implementation: #2, MR, no remote access.
        If someone else owns tag, reads are dropped.

        TODO: Remote vs Local Reads
        """
        # Create the read event using super.
        access = super(TagReplica, self).read(name, **kwargs)

        # Record the number of attempts for the access
        if access.is_local_to(self):
            access.attempts += 1

        # Increase the session on access.
        self.handle_session()

        # Are we the owner of this tag?
        if self.owns(access.name):
            # TODO: Change to last commit!
            version = self.log[access.name].lastVersion

            # If the version is None, bail since we haven't read anything
            if version is None:
                return access.drop(empty=True)

            # Update the version, complete the read, and log the access
            access.update(version, completed=True)
            access.log(self)

            # Return, we're done reading!
            return access

        # Is there a different owner for the tag?
        owner = self.find_owner(access.name)
        if owner is not None:
            # Right now just drop the read on its face.
            self.sim.logger.info(
                "ownership conflict: dropped {} at {}".format(access, self)
            )
            return access.drop()

        # We're going to acquire the tag!
        # Log the access from this particular replica.
        access.log(self)

        # We're going to have some read latency, retry the read.
        Timer(
            self.env, self.heartbeat_interval, lambda: self.read(access)
        ).start()

        if access.attempts <= 1 and self.state != State.TAGGING:
            # Request the ownership of the tag
            self.acquire(access.name)

        return access

    def write(self, name, **kwargs):
        """
        When a replica performs a write it needs to decide if it can write
        to the tag locally, can acquire a tag for this object, or if it has
        to do something else like drop, wait, or remote write.

        If the access is local:

            - if the replica owns the tag, append and complete
            - if someone else owns the tag then drop, wait, or remote
            - if no one owns the tag, then attempt to acquire it

        If access is remote:

            - if we own the tag, then append but do not complete (at local)
            - if someone else owns the tag, log and forward to owner
            - if no one owns the tag then respond false

        ``name`` may be an object name or an existing access (the retry
        timer below passes the access back in), so all lookups use
        ``access.name``.

        Returns the access. Raises AccessError for a remote write that
        carries no version.
        """
        # Create the write event using super.
        access = super(TagReplica, self).write(name, **kwargs)

        # Increase the session on access.
        self.handle_session()

        # Determine if the write is local or remote
        if access.is_local_to(self):
            # Record the number of attempts for the access
            access.attempts += 1

            # Fetch the latest version from the log.
            latest = self.log[access.name].lastVersion

            # Perform the write
            if latest is None:
                version = namespace(access.name)(self)
            else:
                version = latest.nextv(self)

            # Update the access with the latest version
            access.update(version)

        else:
            # If there is no version, raise an exception
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".format(self)
                )

            # Save the version variable for use below.
            version = access.version

        # Log the access at this replica
        access.log(self)

        # Are we the owner of this tag?
        if self.owns(access.name):
            # Perform the append entries. Key by access.name: `name` can be
            # an access object, which would silently create a new log.
            self.log[access.name].append(version, self.epoch)

            # Update the version to track visibility latency
            version.update(self)

            # Complete the access if it was local
            if access.is_local_to(self):
                access.complete()

            # Now do AppendEntries
            # Also interrupt the heartbeat since we just sent AppendEntries
            if not settings.simulation.aggregate_writes:
                self.send_append_entries()
                if self.heartbeat:
                    self.heartbeat.stop()

            return access

        # Is there a different owner for the tag?
        owner = self.find_owner(access.name)
        if owner is not None:
            # Right now just drop the write on its face.
            self.sim.logger.info(
                "ownership conflict: dropped {} at {}".format(access, self)
            )
            return access.drop()

        # We're going to acquire the tag!
        # We're going to have some write latency, retry the write.
        Timer(
            self.env, self.heartbeat_interval, lambda: self.write(access)
        ).start()

        # Request the ownership of the tag
        self.acquire(access.name)

        return access

    def run(self):
        """
        We have to check in at every heartbeat interval. If we own a tag
        then send a heartbeat message, otherwise just keep quiescing.
        """
        while True:
            if self.state == State.OWNER:
                self.heartbeat = Timer(
                    self.env, self.heartbeat_interval, self.on_heartbeat_timeout
                )
                yield self.heartbeat.start()
            else:
                yield self.env.timeout(self.heartbeat_interval)

    ######################################################################
    ## Helper Methods
    ######################################################################

    def owns(self, name):
        """
        Returns True if the name is in the current view for that owner.
        """
        return name in self.view[self]

    def find_owner(self, name):
        """
        Looks up the owner of the name in the current view.
        Returns None if there is no owner fo the tag.
        """
        for owner, tag in self.view.items():
            if name in tag:
                return owner
        return None

    def acquire(self, tag):
        """
        Sends out the acquire tag RPC. A single name is wrapped in a
        frozenset and merged with the names this replica already holds.
        """
        # Construct the tag to send out
        if not isinstance(tag, (set, frozenset)):
            tag = frozenset([tag])

        # Make sure to request the tag we already have
        tag = frozenset(self.view[self] | tag)

        # Request tag with all current tags
        self.send_tag_request(tag)

        # Log the tag acquisition
        self.sim.logger.info(
            "{} is atempting to acquire tag {}".format(self, tag)
        )

    def release(self, tag=None):
        """
        Sends out the release tag RPC. With no argument every currently
        held name is released; otherwise the given name(s) are removed
        from the held set and the remainder is requested.
        """
        # Release all currently held tags
        if tag is None:
            tag = self.view[self]

        # Construct the tag to send out (if specified)
        if not isinstance(tag, (set, frozenset)):
            tag = frozenset([tag])

        # Request the difference of the tags we already have
        tag = frozenset(self.view[self] - tag)

        # Request tag with all current tags
        self.send_tag_request(tag)

        # Log the tag release
        self.sim.logger.info(
            "{} is atempting to release tag {}".format(self, tag)
        )

    def handle_session(self):
        """
        Starts a session timer if one isn't running, otherwise resets the
        currently running session timer on an additional access.
        """
        if not self.session:
            self.session = Timer(
                self.env, self.session_timeout,
                partial(self.on_session_timeout, self.env.now)
            )
        else:
            self.session = self.session.reset()

    def get_log_state(self, tag=None):
        """
        Constructs a log state object for append entries responses, either
        for the current tag or simply the current view. Returns a dict
        mapping each object name to its LogState.
        """
        if tag is None:
            tag = [obj for view in self.view.values() for obj in view]

        return {
            obj: LogState(
                self.log[obj].lastApplied,
                self.log[obj].lastTerm,
                self.log[obj].commitIndex
            ) for obj in tag
        }

    def send_tag_request(self, tag):
        """
        Broadcasts a tag request for the passed in tag.
        """
        # Change state to tagging and save tag locally
        self.state = State.TAGGING
        self.tag = tag

        # Request the entire tag in your current view.
        tagset = {
            owner.id: owned for owner, owned in self.view.items()
        }
        tagset[self.id] = self.tag

        # Send the tag request RPC to each neighbor
        rpc = RequestTag(self.epoch, tagset, self)
        for neighbor in self.neighbors():
            self.send(neighbor, rpc)

    def send_append_entries(self, target=None):
        """
        Helper function to send append entries to quorum or a specific
        node. Does nothing unless this replica is in the OWNER state.

        Note: fails silently if target is not in the neighbors list.
        """
        # ownership check
        if not self.state == State.OWNER:
            return

        # Go through follower list.
        for node, objs in self.nextIndex.iteritems():
            # Filter based on the target supplied.
            if target is not None and node != target:
                continue

            # Construct the entries, or empty for heartbeat
            # The tag contains the state of each item to be sent
            entries = defaultdict(list)
            tag = defaultdict(LogState)

            for obj, nidx in objs.items():
                # A rule directly from the Raft paper
                if self.log[obj].lastApplied >= nidx:
                    entries[obj] = self.log[obj][nidx:]

                # Compute the previous log index and term
                prevLogIndex = nidx - 1
                prevLogTerm = self.log[obj][prevLogIndex].term
                commitIndex = self.log[obj].commitIndex

                # Create the tag state
                tag[obj] = LogState(prevLogIndex, prevLogTerm, commitIndex)

            # Send the append entries message
            self.send(
                node, AppendEntries(self.epoch, self.id, tag, entries)
            )

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Setting the state decides how the Tag node will interact.

        Raises SimulationException when the state is not one of READY,
        TAGGING or OWNER.
        """
        # Do state specific tag modifications
        if self.state == State.READY:
            self.votes = None
            self.tag = None

            # Remove owner state
            self.nextIndex = None
            self.matchIndex = None

            # Also interrupt the heartbeat
            if self.heartbeat:
                self.heartbeat.stop()

        elif self.state == State.TAGGING:
            # Convert to tag acquisition/release
            self.epoch += 1

            # Create election and vote for self
            self.votes = Election([node.id for node in self.quorum()])
            self.votes.vote(self.id)

            # Also interrupt the heartbeat
            if self.heartbeat:
                self.heartbeat.stop()

        elif self.state == State.OWNER:
            # Create the next index and match index
            self.nextIndex = {
                node: {
                    obj: self.log[obj].lastApplied + 1
                    for obj in self.view[self]
                } for node in self.neighbors()
            }

            self.matchIndex = {
                node: {
                    obj: 0 for obj in self.view[self]
                } for node in self.neighbors()
            }

        else:
            # Report the replica's actual state; the bare name `state` is
            # not defined in this scope.
            raise SimulationException(
                "Unknown Tag Replica State: {!r} set on {}".format(self.state, self)
            )

    def on_heartbeat_timeout(self):
        """
        Time to send a heartbeat message to all tags.
        """
        if not self.state == State.OWNER:
            return

        # Send heartbeat or aggregated writes
        self.send_append_entries()

    def on_session_timeout(self, started):
        """
        If the session times out then go ahead and release the tag.
        `started` is the simulation time at which the session began.
        """
        duration = self.env.now - started

        self.sim.logger.info(
            "session on {} terminated at {} ({} ms)".format(
                self.id, self.env.now, duration
            )
        )

        self.sim.results.update(
            'session length', (self.id, duration)
        )

        self.session = None
        self.release()

    def on_request_tag_rpc(self, msg):
        """
        Respond to a request for a tag acquisition from a server.
        """
        rpc = msg.value
        accept = True

        # The requested epoch must be greater than or equal to the local one.
        if rpc.epoch < self.epoch:
            accept = False

        # Ensure that no one else owns the tag in your current view.
        for candidate, tagset in rpc.tag.items():
            # Short circuit
            if not accept:
                break

            for tag in tagset:
                owner = self.find_owner(tag)
                if owner is not None and owner.id != candidate:
                    accept = False
                    break

        # Log the vote decision
        amsg = "accepted" if accept else "did not accept"
        lmsg = "{} {} tag [{}] for {}".format(
            self, amsg, ",".join(rpc.tag[rpc.candidate.id]), rpc.candidate.id
        )
        self.sim.logger.info(lmsg)

        # Send the vote response back to the tag requester
        return self.send(
            msg.source, TagResponse(self.epoch, accept)
        )

    def on_tag_response_rpc(self, msg):
        """
        Handle the votes from tag requests to other nodes.

        Raises TagRPCException if the replica is in an unknown state.
        """
        rpc = msg.value

        if self.state == State.TAGGING:
            # If the epoch is greater than the current epoch
            if rpc.epoch > self.epoch:
                # Retry the tag request
                self.epoch = rpc.epoch
                self.send_tag_request(self.tag)

                self.sim.logger.info(
                    "{} retrying tag request for {}".format(self, self.tag)
                )

                # Exit: no more work required!
                return

            # Update the current election
            self.votes.vote(msg.source.id, rpc.accept)
            if self.votes.has_passed():
                # Update our local tag and become owner.
                # NOTE(review): if assigning self.state triggers
                # on_state_change, the owner indices are built from the
                # view before it is updated on the following line, and the
                # view is never cleared on the READY path -- confirm.
                if self.tag:
                    self.state = State.OWNER
                    self.view[self] = set(self.tag)
                else:
                    self.state = State.READY

                # Send out the ownership change append entries
                self.send_append_entries()

                # Log the new tag owner
                self.sim.logger.info(
                    "{} tag goes to: {}".format(self, self.view[self])
                )

                # Record tag length over time
                self.sim.results.update(
                    'tag size', (self.id, self.env.now, len(self.view[self]))
                )

        elif self.state in (State.READY, State.OWNER):
            # Ignore vote responses if we've changed our state
            return

        else:
            raise TagRPCException(
                "Tag request response in unknown state: '{}'".format(self.state)
            )

    def on_append_entries_rpc(self, msg):
        """
        Handle append entries from a tag owner: reject stale epochs, adopt
        the sender's view and epoch, then apply Raft-like log matching and
        appending independently for every object in the message. Replies
        with an AEResponse carrying per-object success flags and log state.

        Raises TagRPCException if entries arrive for an index that is
        already behind the local log.
        """
        rpc = msg.value

        # reply false if the epoch < current epoch
        if rpc.epoch < self.epoch:
            self.sim.logger.info(
                "{} doesn't accept append entries in epoch {} for epoch {}".format(
                    self, self.epoch, rpc.epoch
                )
            )

            # Send back the request that you made originally.
            return self.send(
                msg.source, AEResponse(
                    self.epoch,
                    {obj: False for obj in rpc.tag.keys()},
                    rpc.tag, Reason.EPOCH
                )
            )

        # Update the view to match the view of the append entries
        # Update the epoch to match the rpc of the append entries
        self.view[msg.source] = set(rpc.tag.keys())
        if self.epoch < rpc.epoch:
            self.epoch = rpc.epoch

        # Now for each object in the RPC, perform Raft-like append entries.
        # The success tracking is a complete tracking for all objects, will
        # return false even if we need to update the log for only one thing.
        # We will reply back with a state object that has per-object details.
        success = defaultdict(bool)
        state = defaultdict(LogState)

        for obj, prev in rpc.tag.items():
            entries = rpc.entries[obj]
            objlog = self.log[obj]

            # If log doesn't contain an entry at prev index matching epoch.
            if objlog.lastApplied < prev.index or objlog[prev.index].term != prev.epoch:

                # Perform the logging of this state failure
                if objlog.lastApplied < prev.index:
                    self.sim.logger.info(
                        "{} doesn't accept append to {} index {} where last applied is {}".format(
                            self, obj, prev.index, objlog.lastApplied
                        )
                    )
                else:
                    self.sim.logger.info(
                        "{} doesn't accept append to {} due to epoch mismatch: {} vs {}".format(
                            self, obj, prev.epoch, objlog[prev.index].term
                        )
                    )

                # Mark that there is a problem and continue
                success[obj] = False
                state[obj] = LogState(objlog.lastApplied, objlog.lastTerm, objlog.lastCommit)
                continue

            # At this point the entries are accepted because of continue statements
            if entries:
                if objlog.lastApplied >= prev.index:
                    # If existing entry conflicts with a new one (same index, different epochs)
                    # Delete the existing entry and all that follow it.
                    if objlog[prev.index].term != prev.epoch:
                        objlog.truncate(prev.index)

                if objlog.lastApplied > prev.index:
                    # Better look into what's happening here!
                    raise TagRPCException(
                        "{} is possibly receiving duplicate append entries".format(self)
                    )

                # Append any new entries not already in the log.
                for entry in entries:
                    # Add the entry/epoch to the log
                    objlog.append(*entry)

                    # Update the versions to compute visibilities
                    entry[0].update(self)

                # Log the last write from the append entries
                self.sim.logger.debug(
                    "appending {} entries to {} log on {} (term {}, commit {})".format(
                        len(entries), obj, self, objlog.lastTerm, objlog.commitIndex
                    )
                )

            # Update the commit index and save the state of the object.
            if prev.commit > objlog.commitIndex:
                objlog.commitIndex = min(prev.commit, objlog.lastApplied)

            success[obj] = True
            state[obj] = LogState(objlog.lastApplied, objlog.lastTerm, objlog.lastCommit)

        # Return the response back to the owner
        reason = Reason.OK if all(success.values()) else Reason.LOG
        return self.send(
            msg.source, AEResponse(self.epoch, success, state, reason)
        )

    def on_ae_response_rpc(self, msg):
        """
        Handles acknowledgment of append entries messages.

        As owner: update the per-object next/match indices for the
        responder, try to advance each object's commit index, and resend
        append entries to the responder if any object failed. While
        tagging: retry the tag request if the responder is in a later
        epoch. When ready: ignore. Any other state raises TagRPCException.
        """
        rpc = msg.value
        retry = False

        if self.state == State.OWNER:

            # Update state of followers in the tag group
            for obj, success in rpc.success.items():
                if success:
                    self.nextIndex[msg.source][obj] = rpc.tag[obj].index + 1
                    self.matchIndex[msg.source][obj] = rpc.tag[obj].index

                else:
                    # If the epoch is not the same, update accordingly.
                    if rpc.epoch > self.epoch:
                        self.epoch = rpc.epoch

                    # If the failure was because of the epoch, simply retry.
                    if rpc.reason == Reason.EPOCH:
                        retry = True

                    # Otherwise decrement the next index and to retry
                    elif rpc.reason == Reason.LOG:
                        # Floor at 1 (the start of the log) so the previous
                        # index computed when resending never goes negative.
                        nidx = self.nextIndex[msg.source][obj] - 1
                        self.nextIndex[msg.source][obj] = max(nidx, 1)
                        retry = True

                    else:
                        raise TagRPCException(
                            "Unknown append entries failure reason: {}".format(rpc.reason)
                        )

            # Determine if we can commit the entry
            for obj, _ in rpc.tag.items():
                log = self.log[obj]

                for n in xrange(log.lastApplied, log.commitIndex, -1):
                    commit = Election(self.matchIndex.keys())
                    for node, objs in self.matchIndex.items():
                        match = objs[obj]
                        commit.vote(node, match >= n)

                    if commit.has_passed() and log[n].term == self.epoch:
                        # Commit all versions from the last log to now.
                        for idx in xrange(log.commitIndex, n + 1):
                            if not log[idx].version:
                                continue
                            log[idx].version.update(self, commit=True)

                        # Set the commit index and break
                        log.commitIndex = n
                        break

            # If retry, send append entries back to the source.
            if retry:
                self.send_append_entries(msg.source)

        elif self.state == State.TAGGING:
            # Determine if we need to retry the tagging again.
            if rpc.epoch > self.epoch:
                # Retry the tag request
                self.epoch = rpc.epoch
                self.send_tag_request(self.tag)

                self.sim.logger.info(
                    "{} retrying tag request for {}".format(self, self.tag)
                )

            return

        elif self.state == State.READY:
            # Ignore AE messages if we're not an owner anymore.
            return

        else:
            raise TagRPCException(
                "Response in unknown state: '{}'".format(self.state)
            )

    def on_remote_access(self, msg):
        """
        Handles remote writes to and from the replicas. Replies with a
        failed AccessResponse when this replica does not own the object;
        otherwise performs the read or write and replies with success.
        """
        access = msg.value.access

        # Ensure that we own the object
        if not self.owns(access.name):
            return self.send(
                msg.source, AccessResponse(self.epoch, False, access)
            )

        # If we do own the object, then respond:
        method = {
            'read': self.read,
            'write': self.write,
        }[access.type]

        # Call the remote method with the access.
        method(access)

        return self.send(
            msg.source, AccessResponse(self.epoch, True, access)
        )

    def on_access_response_rpc(self, msg):
        """
        Handles responses to remote accesses: completes the access when
        the remote replica reports success.
        """
        rpc = msg.value
        if rpc.success:
            rpc.access.complete()
class RaftReplica(ConsensusReplica):
    """
    A replica that takes part in a Raft quorum: it is a follower, a
    candidate or the leader, keeps a term-stamped write log, and exchanges
    RequestVote / AppendEntries / RemoteWrite messages with its peers.

    Reads are served locally according to the configured read policy.
    Writes are appended to the log on the leader; any other replica caches
    the new version and forwards the write to the leader.
    """

    def __init__(self, simulation, **kwargs):
        """
        Recognized keyword arguments (defaults are module constants):
        read_policy, aggregate_writes, election_timeout and
        heartbeat_interval. All kwargs are also passed to the superclass.
        """
        ## Initialize the replica
        super(RaftReplica, self).__init__(simulation, **kwargs)

        ## Initialize Raft Specific settings
        self.state = State.FOLLOWER
        self.currentTerm = 0
        self.votedFor = None
        self.log = MultiObjectWriteLog()
        self.cache = {}  # name -> version written here, awaiting the leader

        ## Policies
        self.read_policy = ReadPolicy.get(kwargs.get('read_policy', READ_POLICY))
        self.aggregate_writes = kwargs.get('aggregate_writes', AGGREGATE_WRITES)

        ## Timers for work
        eto = kwargs.get('election_timeout', ELECTION_TIMEOUT)
        hbt = kwargs.get('heartbeat_interval', HEARTBEAT_INTERVAL)

        self.timeout = ElectionTimer.fromReplica(self, eto)
        self.heartbeat = Timer(self.env, hbt, self.on_heartbeat_timeout)

        ## Leader state
        self.nextIndex = None
        self.matchIndex = None

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def recv(self, event):
        """
        Before dispatching the message to an RPC specific handler, there are
        some message-wide checks that need to occur. In this case the term
        must be inspected and if the replica is behind, become follower.
        """
        message = event.value
        rpc = message.value

        # If RPC request or response contains term > currentTerm
        # Set currentTerm to term and convert to follower.
        if rpc.term > self.currentTerm:
            self.state = State.FOLLOWER
            self.currentTerm = rpc.term

        # Record the received message and dispatch to event handler
        return super(RaftReplica, self).recv(event)

    def read(self, name, **kwargs):
        """
        Performs a local read of `name`. The version returned is chosen by
        the replica's read policy (latest committed, or latest known
        including the local write cache).

        Returns the access: completed with the version that was read, or
        dropped (empty) if no version of the object is known locally.
        """
        # Create the read event using super.
        access = super(RaftReplica, self).read(name, **kwargs)

        # Record the number of attempts for the access
        if access.is_local_to(self):
            access.attempts += 1

        # NOTE: Formerly, this was ALWAYS read commit not read latest, now
        # it is set by the read policy on the replica. We previously noted that
        # read committed was one of the key differences from eventual.
        version = self.read_via_policy(access.name)

        # If the version is None, then we haven't read anything!
        if version is None:
            return access.drop(empty=True)

        # Because this is a local read, complete the read immediately.
        access.update(version, completed=True)

        # Log the access from this particular replica.
        access.log(self)

        return access

    def write(self, name, **kwargs):
        """
        The write can be initiated on any replica server, including followers.
        Step one is to create the access event using super, which will give us
        the ability to detect local vs. remote writes.

        If the write is local:
        - create a new version from the latest write.
        - if follower: send a RemoteWrite with new version to the leader
          (write latency) and store a cache copy so that followers can read
          their own writes.
        - if leader: append to log and complete (no leader latency)

        If the write is remote:
        - if follower: log it and forward to leader
        - if leader: append to log but do not complete (complete at local)

        After local vs. remote (leader only):
        1. update the version for visibility latency
        2. send append entries, unless writes are aggregated into heartbeats

        Raises AccessError if a remote write arrives without a version.

        NOTE(review): the original documentation says the cached copy goes
        away on AppendEntries, but no eviction of self.cache is visible in
        the handlers of this class - confirm.
        """
        access = super(RaftReplica, self).write(name, **kwargs)

        # Determine if the write is local or remote
        if access.is_local_to(self):
            # Record the number of attempts for the access
            access.attempts += 1

            # Write a new version to the latest read by policy
            version = self.write_via_policy(access.name)

            # Update the access with the latest version
            access.update(version)

            # Log the access from this particular replica.
            access.log(self)

            if self.state == State.LEADER:
                # Append to log and complete if leader and local
                self.append_via_policy(access, complete=True)

            else:
                # Store the version in the cache and send remote write.
                self.cache[access.name] = version
                return self.send_remote_write(access)

        else:
            # Log the access from this particular replica.
            access.log(self)

            # If there is no version, raise an exception
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".format(self)
                )

            # Save the version variable for use below.
            version = access.version

            if self.state == State.LEADER:
                # Append to log but do not complete since its remote
                self.append_via_policy(access, complete=False)

            else:
                # Remote write occurred from client to a follower
                self.sim.logger.info(
                    "remote write on follower node: {}".format(self)
                )

                # Store the version in the cache and send remote write.
                self.cache[access.name] = version
                return self.send_remote_write(access)

        # At this point we've dealt with local vs. remote, we should be the leader
        assert self.state == State.LEADER

        # Update the version to track visibility latency
        forte = bool(settings.simulation.forte_on_append)
        version.update(self, forte=forte)

        # Now do AppendEntries
        # Also interrupt the heartbeat since we just sent AppendEntries
        if not self.aggregate_writes:
            self.send_append_entries()
            self.heartbeat.stop()

        return access

    def run(self):
        """
        Generator driving the replica: followers and candidates wait on the
        election timer, the leader waits on the heartbeat timer. Raises
        SimulationException if the replica is in any other state.
        """
        while True:
            if self.state in {State.FOLLOWER, State.CANDIDATE}:
                yield self.timeout.start()

            elif self.state == State.LEADER:
                yield self.heartbeat.start()

            else:
                raise SimulationException(
                    "Unknown Raft State: {!r} on {}".format(self.state, self)
                )

    ######################################################################
    ## Helper Methods
    ######################################################################

    def send_append_entries(self, target=None):
        """
        Helper function to send append entries to every follower or, when
        `target` is given, to that node only. An empty entries list makes
        the message a plain heartbeat.

        Note: does nothing if this replica is not the leader, and fails
        silently if target is not in the follower list.
        """
        # Leader check
        if self.state != State.LEADER:
            return

        # Go through follower list.
        for node, nidx in self.nextIndex.items():
            # Filter based on the target supplied.
            if target is not None and node != target:
                continue

            # Construct the entries, or empty for heartbeat
            entries = []
            if self.log.lastApplied >= nidx:
                entries = self.log[nidx:]

            # Compute the previous log index and term
            prevLogIndex = nidx - 1
            prevLogTerm = self.log[prevLogIndex].term

            # Send the heartbeat message
            self.send(
                node, AppendEntries(
                    self.currentTerm, self.id, prevLogIndex,
                    prevLogTerm, entries, self.log.commitIndex
                )
            )

    def send_remote_write(self, access):
        """
        Helper function to send a remote write from a follower to leader.
        Returns the access, or the result of dropping it when no leader can
        be found.
        """
        # Find the leader to perform the remote write.
        leader = self.get_leader_node()

        # If there is no leader, then drop the write
        if not leader:
            self.sim.logger.info(
                "no leader: dropped write at {}".format(self)
            )

            return access.drop()

        # Send the remote write to the leader
        self.send(
            leader, RemoteWrite(self.currentTerm, access)
        )

        return access

    def get_leader_node(self):
        """
        Searches for the leader amongst the quorum and returns it, or None
        if no node is currently the leader. Raises an exception if there
        are multiple leaders, which is an extreme edge case.
        """
        leaders = [
            node for node in self.quorum() if node.state == State.LEADER
        ]

        if len(leaders) > 1:
            raise SimulationException("MutipleLeaders?!")

        if not leaders:
            return None

        return leaders[0]

    def read_via_policy(self, name):
        """
        This method returns a version from either the log or the cache
        according to the read policy set on the replica server as follows:

            - COMMIT: return the latest commited version (ignoring cache)
            - LATEST: return latest version in log or in cache

        Returns None when no version of the object is known. This method
        raises an exception on bad read policies.
        """
        # If the policy is read committed, return the latest committed version
        if self.read_policy == ReadPolicy.COMMIT:
            return self.log.get_latest_commit(name)

        # If the policy is latest, read the latest and compare to cache.
        if self.read_policy == ReadPolicy.LATEST:
            # Get the latest version from the log (committed or not)
            version = self.log.get_latest_version(name)

            # Prefer the cached version when it is newer than the log, or
            # when the log has nothing for this object yet. Without the
            # latter case a replica could not read its own first write to a
            # new object while that write is still in flight to the leader.
            cached = self.cache.get(name)
            if cached is not None and (version is None or cached > version):
                return cached

            # Return the latest version
            return version

        # If we've reached this point, we don't know what to do!
        raise SimulationException("Unknown read policy!")

    def write_via_policy(self, name):
        """
        This method returns a new version incremented from either from the
        log or from the cache according to the read policy. It also handles
        any "new" writes, e.g. to objects that haven't been written yet.
        """
        # Fetch the version from the log or the cache according to the
        # read policy. This implements READ COMMITTED/READ LATEST
        latest = self.read_via_policy(name)

        # First write to this object: create a fresh version for the name.
        if latest is None:
            return namespace(name)(self)

        return latest.nextv(self)

    def append_via_policy(self, access, complete=False):
        """
        This method is the gatekeeper for the log and can implement policies
        like "don't admit forks". It must drop the access if it doesn't meet
        the policy, and complete it if specified.

        Returns True to indicate the access was appended to the log.

        NOTE: This is a leader-only method (followers have entries appended
        to their logs via AppendEntries) and will raise an exception if the
        node is not the leader.
        """
        if self.state != State.LEADER:
            raise RaftRPCException(
                "Append via policies called on a follower replica!"
            )

        # The default policy is just append anything
        # NOTE: subclasses (as in Federated) can modify this
        self.log.append(access.version, self.currentTerm)

        # Complete the access if specified by the caller.
        if complete:
            access.complete()

        # Indicate that we've successfully appended to the log
        return True

    def _send_ae_response(self, target, success):
        """
        Replies to an AppendEntries message with the current term, the
        given success flag and this replica's last applied / last commit.
        """
        return self.send(
            target, AEResponse(
                self.currentTerm, success,
                self.log.lastApplied, self.log.lastCommit
            )
        )

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        When the state on a replica changes the internal state of the replica
        must also change, particularly the properties that define how the node
        interacts with RPC messages and client reads/writes.

        Followers and candidates forget their vote and any leader
        bookkeeping; a new leader initializes nextIndex/matchIndex for every
        other node in the quorum. Raises SimulationException for a state it
        does not recognize.
        """
        if self.state in (State.FOLLOWER, State.CANDIDATE):
            self.votedFor = None
            self.nextIndex = None
            self.matchIndex = None

        elif self.state == State.LEADER:
            self.nextIndex = {
                node: self.log.lastApplied + 1
                for node in self.quorum() if node != self
            }
            self.matchIndex = {
                node: 0 for node in self.quorum() if node != self
            }

        elif self.state == State.READY:
            # This happens on the call to super, just ignore for now.
            pass

        else:
            raise SimulationException(
                "Unknown Raft State: {!r} set on {}".format(self.state, self)
            )

    def on_heartbeat_timeout(self):
        """
        Callback for when a heartbeat timeout occurs, for AppendEntries RPC.
        Does nothing unless this replica is the leader.
        """
        if self.state != State.LEADER:
            return

        # Send heartbeat or aggregated writes
        self.send_append_entries()

    def on_election_timeout(self):
        """
        Callback for when an election timeout occurs, e.g. become candidate:
        start a new term, vote for self and request votes from every other
        node in the quorum.
        """
        # Set state to candidate
        self.state = State.CANDIDATE

        # Create Election and vote for self
        self.currentTerm += 1
        self.votes = Election([node.id for node in self.quorum()])
        self.votes.vote(self.id)
        self.votedFor = self.id

        # Inform the rest of the quorum you'd like their vote.
        rpc = RequestVote(
            self.currentTerm, self.id, self.log.lastApplied, self.log.lastTerm
        )

        for follower in self.quorum():
            if follower == self:
                continue
            self.send(
                follower, rpc
            )

        # Log the newly formed candidacy
        self.sim.logger.info(
            "{} is now a leader candidate".format(self)
        )

    def on_request_vote_rpc(self, msg):
        """
        Callback for RequestVote RPC call. The vote is granted only if the
        candidate's term is not behind ours, we have not voted for somebody
        else, and the log check against the candidate's last term/index
        passes; otherwise a negative VoteResponse is sent.
        """
        rpc = msg.value

        grant = (
            rpc.term >= self.currentTerm
            and (self.votedFor is None or self.votedFor == rpc.candidateId)
            and self.log.as_up_to_date(rpc.lastLogTerm, rpc.lastLogIndex)
        )

        if grant:
            self.sim.logger.info(
                "{} voting for {}".format(self, rpc.candidateId)
            )

            self.timeout.stop()
            self.votedFor = rpc.candidateId
            return self.send(
                msg.source, VoteResponse(self.currentTerm, True)
            )

        return self.send(
            msg.source, VoteResponse(self.currentTerm, False)
        )

    def on_vote_response_rpc(self, msg):
        """
        Callback for the response to a RequestVote RPC. A candidate tallies
        the vote and becomes leader once the election passes; followers and
        leaders ignore the message. Raises RaftRPCException in any other
        state.
        """
        rpc = msg.value

        if self.state == State.CANDIDATE:

            # Update the current election
            self.votes.vote(msg.source.id, rpc.voteGranted)
            if self.votes.has_passed():
                ## Become the leader
                self.state = State.LEADER
                self.timeout.stop()

                ## Send the leadership change append entries
                self.send_append_entries()

                ## Log the new leader
                self.sim.logger.info(
                    "{} has become raft leader".format(self)
                )

            return

        elif self.state in (State.FOLLOWER, State.LEADER):
            # Ignore vote responses if we've already been elected.
            return

        else:
            raise RaftRPCException(
                "Vote response in unknown state: '{}'".format(self.state)
            )

    def on_append_entries_rpc(self, msg):
        """
        Callback for the AppendEntries RPC call. Rejects the message when
        its term is behind ours or when our log has no entry matching the
        previous index/term; otherwise appends the new entries, advances
        the commit index and acknowledges success.
        """
        rpc = msg.value

        # Stop the election timeout
        self.timeout.stop()

        # Reply false if term < current term
        if rpc.term < self.currentTerm:
            self.sim.logger.info("{} doesn't accept write on term {}".format(self, self.currentTerm))
            return self._send_ae_response(msg.source, False)

        # Reply false if log doesn't contain an entry at prevLogIndex whose
        # term matches previous log term.
        if self.log.lastApplied < rpc.prevLogIndex or self.log[rpc.prevLogIndex][1] != rpc.prevLogTerm:
            if self.log.lastApplied < rpc.prevLogIndex:

                self.sim.logger.info(
                    "{} doesn't accept write on index {} where last applied is {}".format(
                        self, rpc.prevLogIndex, self.log.lastApplied
                    )
                )
            else:
                self.sim.logger.info(
                    "{} doesn't accept write for term mismatch {} vs {}".format(
                        self, rpc.prevLogTerm, self.log[rpc.prevLogIndex][1]
                    )
                )

            return self._send_ae_response(msg.source, False)

        # At this point AppendEntries RPC is accepted
        if rpc.entries:
            if self.log.lastApplied >= rpc.prevLogIndex:

                # If existing entry conflicts with new one (same index, different terms)
                # Delete the existing entry and all that follow it.
                # NOTE(review): the rejection check above already returned
                # for a term mismatch at prevLogIndex, so this truncate
                # looks unreachable - confirm the intended conflict check.
                if self.log[rpc.prevLogIndex][1] != rpc.prevLogTerm:
                    self.log.truncate(rpc.prevLogIndex)

            if self.log.lastApplied > rpc.prevLogIndex:
                # Our log already extends past prevLogIndex, so this could
                # be a message that was sent again: acknowledge it without
                # appending anything.
                self.sim.logger.warn(
                    "{} is possibly receiving a duplicate append entries!".format(self)
                )
                return self._send_ae_response(msg.source, True)

            # Append any new entries not already in the log.
            for entry in rpc.entries:
                # Add the entry/term to the log
                self.log.append(*entry)
                self.sim.logger.debug(
                    "appending {} to {} on {}".format(entry[0], entry[1], self)
                )

                # Update the versions to compute visibilities
                entry[0].update(self)

            # Log the last write from the append entries.
            self.sim.logger.debug(
                "{} writes {} at idx {} (term {}, commit {})".format(
                    self, self.log.lastVersion, self.log.lastApplied,
                    self.log.lastTerm, self.log.commitIndex
                )
            )

        # If leaderCommit > commitIndex, update commit Index
        if rpc.leaderCommit > self.log.commitIndex:
            self.log.commitIndex = min(rpc.leaderCommit, self.log.lastApplied)

        # Return success response.
        return self._send_ae_response(msg.source, True)

    def on_ae_response_rpc(self, msg):
        """
        Handles acknowledgment of append entries message. The leader updates
        its view of the follower (retrying one entry earlier on failure) and
        commits the highest entry from the current term that passes the
        commit election. A candidate steps down if the responder's term is
        at least its own; followers ignore the message. Raises
        RaftRPCException in any other state.
        """
        rpc = msg.value

        if self.state == State.LEADER:

            if rpc.success:
                self.nextIndex[msg.source] = rpc.lastLogIndex + 1
                self.matchIndex[msg.source] = rpc.lastLogIndex

            else:
                # Decrement next index and retry append entries
                # Ensure to floor the nextIndex to 1 (the start of the log).
                nidx = self.nextIndex[msg.source] - 1
                self.nextIndex[msg.source] = max(nidx, 1)
                self.send_append_entries(msg.source)

            # Decide if we can commit the entry
            for n in xrange(self.log.lastApplied, self.log.commitIndex, -1):
                commit = Election(self.matchIndex.keys())
                for node, matched in self.matchIndex.items():
                    commit.vote(node, matched >= n)

                if commit.has_passed() and self.log[n][1] == self.currentTerm:
                    # Commit all versions from the last log entry to now.
                    forte = bool(settings.simulation.forte_on_commit)
                    for idx in xrange(self.log.commitIndex, n+1):
                        version = self.log[idx][0]
                        if version is None:
                            continue
                        version.update(self, commit=True, forte=forte)

                    # Set the commit index and break
                    self.log.commitIndex = n
                    break

        elif self.state == State.CANDIDATE:

            # Decide whether or not to step down.
            if rpc.term >= self.currentTerm:
                ## Become a follower
                self.state = State.FOLLOWER

                ## Log the failed election
                self.sim.logger.info(
                    "{} has stepped down as candidate".format(self)
                )

            return

        elif self.state == State.FOLLOWER:
            # Ignore AE messages if we are the follower.
            return

        else:
            raise RaftRPCException(
                "Append entries response in unknown state: '{}'".format(self.state)
            )

    def on_remote_write_rpc(self, message):
        """
        Unpacks the access from the remote write, initiates a local write
        with it, and reports back to the sender whether it was dropped.
        """
        # Write the access from the remote replica
        access = message.value.version
        self.write(access)

        # Check if the access was dropped (e.g. the write failed)
        success = not access.is_dropped()

        # Send the write response
        self.send(message.source, WriteResponse(self.currentTerm, success, access))

    def on_write_response_rpc(self, message):
        """
        Completes the write if the remote write was successful.
        """
        rpc = message.value
        if rpc.success:
            rpc.access.complete()