Example #1
0
class TagReplica(ConsensusReplica):

    def __init__(self, simulation, **kwargs):
        """
        Set up the tag-specific state and then initialize the base replica.

        Reads the optional ``session_timeout`` and ``heartbeat_interval``
        keyword arguments, falling back to the module level defaults.
        """
        # Timing configuration and the timer handles that use it
        self.session_timeout = kwargs.get('session_timeout', SESSION_TIMEOUT)
        self.heartbeat_interval = kwargs.get(
            'heartbeat_interval', HEARTBEAT_INTERVAL
        )
        self.session = None
        self.heartbeat = None

        # Tag bookkeeping: epoch, per-object write logs, owner -> names view
        self.epoch = 0
        self.log = defaultdict(WriteLog)
        self.view = defaultdict(set)

        # Owner-only replication indices (filled in by on_state_change)
        self.nextIndex = None
        self.matchIndex = None

        # Everything above is assigned before the base initializer runs
        super(TagReplica, self).__init__(simulation, **kwargs)
        self.state = State.READY

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def read(self, name, **kwargs):
        """
        Perform a read, serving it locally only when this replica owns the
        tag for the object being read.

        The read is converted into an access by the parent class and then:

            - if this replica owns the name, the read completes with the last
              version in the log (or is dropped as empty if there is none)
            - if another replica owns the name, the read is dropped
            - if no one owns the name, the access is logged, a retry is
              scheduled one heartbeat interval from now and, on the first
              attempt while not already tagging, the tag is requested

        Returns the access, or whatever ``access.drop`` returns for drops.

        TODO: Remote vs Local Reads
        """
        access = super(TagReplica, self).read(name, **kwargs)

        # Only accesses that are local to this replica count as attempts.
        if access.is_local_to(self):
            access.attempts += 1

        # Every access touches the session timer.
        self.handle_session()

        if self.owns(access.name):
            # TODO: Change to last commit!
            latest = self.log[access.name].lastVersion

            # No version in the log means there is nothing to read yet.
            if latest is None:
                return access.drop(empty=True)

            access.update(latest, completed=True)
            access.log(self)
            return access

        if self.find_owner(access.name) is not None:
            # Someone else holds the tag; no remote reads yet, so drop.
            self.sim.logger.info(
                "ownership conflict: dropped {} at {}".format(access, self)
            )
            return access.drop()

        # Nobody owns the tag: record the access and try again later.
        access.log(self)
        Timer(
            self.env, self.heartbeat_interval, lambda: self.read(access)
        ).start()

        # Only the first attempt triggers a tag request, and only when a
        # request is not already in flight.
        first_attempt = access.attempts <= 1
        if first_attempt and self.state != State.TAGGING:
            self.acquire(access.name)

        return access

    def write(self, name, **kwargs):
        """
        When a replica performs a write it needs to decide if it can write to
        the tag locally, can acquire a tag for this object, or if it has to do
        something else like drop, wait, or remote write.

        If the access is local:

            - if the replica owns the tag, append and complete
            - if someone else owns the tag then the write is dropped
            - if no one owns the tag, then attempt to acquire it and retry
              the write after one heartbeat interval

        If access is remote:

            - the access must already carry a version (AccessError otherwise)
            - if we own the tag, then append but do not complete (at local)

        ``name`` may be an object name or an existing access (the retry timer
        and the remote access handler both pass the access back in), so every
        lookup below goes through ``access.name`` rather than ``name``.

        Returns the access, or whatever ``access.drop`` returns for drops.
        """
        # Create the write event using super.
        access = super(TagReplica, self).write(name, **kwargs)

        # Increase the session on access.
        self.handle_session()

        # Determine if the write is local or remote
        if access.is_local_to(self):
            # Record the number of attempts for the access
            access.attempts += 1

            # Fetch the latest version from the log.
            latest = self.log[access.name].lastVersion

            # Perform the write: fork from the latest version if there is one
            if latest is None:
                version = namespace(access.name)(self)
            else:
                version = latest.nextv(self)

            # Update the access with the latest version
            access.update(version)

        else:
            # If there is no version, raise an exception
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".format(self)
                )

            # Save the version variable for use below.
            version = access.version

        # Log the access at this replica
        access.log(self)

        # Are we the owner of this tag?
        if self.owns(access.name):
            # Perform the append entries. Key the log by the object name, not
            # by the raw argument, which is an access object on retries.
            self.log[access.name].append(version, self.epoch)
            # Update the version to track visibility latency
            version.update(self)

            # Complete the access if it was local
            if access.is_local_to(self):
                access.complete()

            # Now do AppendEntries
            # Also interrupt the heartbeat since we just sent AppendEntries
            if not settings.simulation.aggregate_writes:
                self.send_append_entries()
                if self.heartbeat:
                    self.heartbeat.stop()

            return access

        # Is there a different owner for the tag?
        owner = self.find_owner(access.name)
        if owner is not None:
            # Right now just drop the write on its face.
            self.sim.logger.info(
                "ownership conflict: dropped {} at {}".format(access, self)
            )
            return access.drop()

        # We're going to acquire the tag!
        # We're going to have some write latency, retry the write.
        Timer(
            self.env, self.heartbeat_interval, lambda: self.write(access)
        ).start()

        # Request the ownership of the tag
        self.acquire(access.name)

        return access

    def run(self):
        """
        Main process loop: wake up once per heartbeat interval. While this
        replica is an owner a heartbeat timer is armed and waited on;
        otherwise the loop simply waits out the interval.
        """
        while True:
            if self.state != State.OWNER:
                # Not an owner: nothing to send, just wait out the interval.
                yield self.env.timeout(self.heartbeat_interval)
                continue

            # Owner: arm the heartbeat timer and wait on it.
            self.heartbeat = Timer(
                self.env, self.heartbeat_interval, self.on_heartbeat_timeout
            )
            yield self.heartbeat.start()

    ######################################################################
    ## Helper Methods
    ######################################################################

    def owns(self, name):
        """
        Returns True if the name is in the current view for that owner.
        """
        return name in self.view[self]

    def find_owner(self, name):
        """
        Looks up the owner of the name in the current view.
        Returns None if there is no owner fo the tag.
        """
        for owner, tag in self.view.items():
            if name in tag:
                return owner
        return None

    def acquire(self, tag):
        """
        Sends out the acquire tag RPC
        """
        # Construct the tag to send out
        if not isinstance(tag, (set, frozenset)):
            tag = frozenset([tag])

        # Make sure to request the tag we already have
        tag = frozenset(self.view[self] | tag)

        # Request tag with all current tags
        self.send_tag_request(tag)

        # Log the tag acquisition
        self.sim.logger.info(
            "{} is atempting to acquire tag {}".format(self, self.tag)
        )

    def release(self, tag=None):
        """
        Sends out the release tag RPC
        """
        # Release all currently held tags
        if tag is None: tag = self.view[self]

        # Construct the tag to send out (if specified)
        if not isinstance(tag, (set, frozenset)):
            tag = frozenset([tag])

        # Request the difference of the tags we already have
        tag = frozenset(self.view[self] - tag)

        # Request tag with all current tags
        self.send_tag_request(tag)

        # Log the tag release
        self.sim.logger.info(
            "{} is atempting to release tag {}".format(self, tag)
        )

    def handle_session(self):
        """
        Starts a session timer if one isn't running, otherwise resets the
        currently running session timer on an additional access.
        """
        if not self.session:
            self.session = Timer(
                self.env, self.session_timeout,
                partial(self.on_session_timeout, self.env.now)
            )
        else:
            self.session = self.session.reset()

    def get_log_state(self, tag=None):
        """
        Constructs a log state object for append entries responses, either
        for the current tag or simply the current view.
        """
        if tag is None:
            tag = [obj for view in self.view.values() for obj in view]

        return {
            obj: LogState(
                self.log[obj].lastApplied,
                self.log[obj].lastTerm,
                self.log[obj].commitIndex
            ) for obj in tag
        }

    def send_tag_request(self, tag):
        """
        Switches to the tagging state, remembers the requested tag, and
        sends a RequestTag RPC describing the whole view to every neighbor.
        """
        # Change state first, then store the tag being requested.
        self.state = State.TAGGING
        self.tag = tag

        # Describe the view by owner id, with our own entry replaced by the
        # tag that is being requested.
        requested = dict(
            (owner.id, names) for owner, names in self.view.items()
        )
        requested[self.id] = self.tag

        # One RPC object is shared by all of the sends.
        rpc = RequestTag(self.epoch, requested, self)
        for neighbor in self.neighbors():
            self.send(neighbor, rpc)

    def send_append_entries(self, target=None):
        """
        Helper function to send append entries to every follower tracked in
        nextIndex, or only to a specific node when target is given.

        For each object tracked for a follower the message carries the
        entries from that follower's next index onward (empty for a
        heartbeat) and a LogState of the previous index, previous term and
        commit index.

        Does nothing unless this replica is in the owner state.

        Note: fails silently if target is not in the neighbors list.
        """
        # ownership check
        if self.state != State.OWNER:
            return

        # Go through follower list. Uses items() like the rest of the class;
        # iteritems() does not exist on Python 3 dictionaries.
        for node, objs in self.nextIndex.items():
            # Filter based on the target supplied.
            if target is not None and node != target:
                continue

            # Construct the entries, or empty for heartbeat
            # The tag contains the state of each item to be sent
            entries = defaultdict(list)
            tag = defaultdict(LogState)

            for obj, nidx in objs.items():
                # A rule directly from the Raft paper
                if self.log[obj].lastApplied >= nidx:
                    entries[obj] = self.log[obj][nidx:]

                # Compute the previous log index and term
                prevLogIndex = nidx - 1
                prevLogTerm = self.log[obj][prevLogIndex].term
                commitIndex = self.log[obj].commitIndex

                # Create the tag state
                tag[obj] = LogState(prevLogIndex, prevLogTerm, commitIndex)

            # Send the append entries message
            self.send(
                node, AppendEntries(
                    self.epoch, self.id, tag, entries
                )
            )

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Setting the state decides how the Tag node will interact.

        - READY: clears the election, the requested tag and the owner
          indices, and stops the heartbeat timer if there is one.
        - TAGGING: bumps the epoch, starts an election over the quorum with
          a vote for self, and stops the heartbeat timer if there is one.
        - OWNER: builds nextIndex and matchIndex for every neighbor over the
          names in this replica's own view entry.

        Raises SimulationException for any other state.
        """

        # Do state specific tag modifications
        if self.state == State.READY:
            self.votes = None
            self.tag = None

            # Remove owner state
            self.nextIndex = None
            self.matchIndex = None

            # Also interrupt the heartbeat
            if self.heartbeat:
                self.heartbeat.stop()

        elif self.state == State.TAGGING:
            # Convert to tag acquisition/release
            self.epoch += 1

            # Create election and vote for self
            self.votes = Election([node.id for node in self.quorum()])
            self.votes.vote(self.id)

            # Also interrupt the heartbeat
            if self.heartbeat:
                self.heartbeat.stop()

        elif self.state == State.OWNER:

            # Create the next index and match index
            self.nextIndex = {
                node: {
                    obj: self.log[obj].lastApplied + 1
                    for obj in self.view[self]
                } for node in self.neighbors()
            }

            self.matchIndex = {
                node: {
                    obj: 0 for obj in self.view[self]
                } for node in self.neighbors()
            }

        else:
            # Report the offending state; the bare name ``state`` is not
            # defined in this scope and would raise NameError instead.
            raise SimulationException(
                "Unknown Tag Replica State: {!r} set on {}".format(
                    self.state, self
                )
            )

    def on_heartbeat_timeout(self):
        """
        Heartbeat timer callback: an owner sends append entries (a heartbeat
        or any aggregated writes); any other state does nothing.
        """
        if self.state == State.OWNER:
            self.send_append_entries()

    def on_session_timeout(self, started):
        """
        If the session times out then go ahead and release the tag.
        """
        duration = self.env.now - started

        self.sim.logger.info(
            "session on {} terminated at {} ({} ms)".format(
                self.id, self.env.now, duration
            )
        )

        self.sim.results.update(
            'session length', (self.id, duration)
        )

        self.session = None
        self.release()

    def on_request_tag_rpc(self, msg):
        """
        Votes on a tag request from another replica and sends the vote back
        to the requester as a TagResponse.

        The request is rejected when its epoch is behind the local epoch, or
        when any requested name is owned in the local view by a replica other
        than the candidate it is listed under.
        """
        rpc = msg.value

        # The requested epoch must be greater than or equal to the local one.
        accept = not (rpc.epoch < self.epoch)

        # No requested name may belong to a different owner in our view; the
        # scan stops at the first conflict that is found.
        if accept:
            accept = not any(
                owner is not None and owner.id != candidate
                for candidate, names in rpc.tag.items()
                for owner in (self.find_owner(name) for name in names)
            )

        # Log the vote decision
        decision = "accepted" if accept else "did not accept"
        requested = ",".join(rpc.tag[rpc.candidate.id])
        self.sim.logger.info(
            "{} {} tag [{}] for {}".format(
                self, decision, requested, rpc.candidate.id
            )
        )

        # Send the vote response back to the tag requester
        response = TagResponse(self.epoch, accept)
        return self.send(msg.source, response)

    def on_tag_response_rpc(self, msg):
        """
        Handle the votes from tag requests to other nodes.

        While tagging:

            - a response from a later epoch adopts that epoch and re-sends
              the tag request
            - otherwise the vote is counted and, once the election passes,
              the requested tag becomes this replica's entry in the view and
              the replica becomes OWNER (non-empty tag) or READY (empty tag)

        Responses that arrive in the READY or OWNER state are ignored.
        Raises TagRPCException in any other state.
        """
        rpc = msg.value

        if self.state == State.TAGGING:
            # If the epoch is greater than the current epoch
            if rpc.epoch > self.epoch:
                # Retry the tag request
                self.epoch = rpc.epoch
                self.send_tag_request(self.tag)

                self.sim.logger.info(
                    "{} retrying tag request for {}".format(self, self.tag)
                )

                # Exit: no more work required!
                return

            # Update the current election
            self.votes.vote(msg.source.id, rpc.accept)
            if self.votes.has_passed():

                # Update our local view BEFORE changing state. The OWNER
                # branch of on_state_change builds nextIndex/matchIndex from
                # self.view[self], so the view must already hold the new tag
                # (assumes the state setter invokes on_state_change - confirm
                # against the base replica). An empty tag is a full release,
                # which must also empty our view entry so that owns() stops
                # reporting the released names.
                granted = set(self.tag) if self.tag else set()
                self.view[self] = granted
                self.state = State.OWNER if granted else State.READY

                # Send out the ownership change append entries
                self.send_append_entries()

                # Log the new tag owner
                self.sim.logger.info(
                    "{} tag goes to: {}".format(self, self.view[self])
                )

                # Record tag length over time
                self.sim.results.update(
                    'tag size', (self.id, self.env.now, len(self.view[self]))
                )

        elif self.state in (State.READY, State.OWNER):
            # Ignore vote responses if we've changed our state
            return

        else:
            raise TagRPCException(
                "Tag request response in unknown state: '{}'".format(self.state)
            )

    def on_append_entries_rpc(self, msg):
        """
        Handles an append entries message from a tag owner.

        A message from an epoch behind the local epoch is refused with an
        AEResponse marking every object False and the reason Reason.EPOCH.
        Otherwise the sender's entry in the view is replaced by the names in
        the message, a later epoch is adopted, and each object is checked
        against its own log: entries are appended only when the log has an
        entry at the previous index whose term matches. The reply carries a
        per-object success flag and LogState, with Reason.OK when every
        object succeeded and Reason.LOG otherwise.
        """
        rpc = msg.value

        # reply false if the epoch < current epoch
        if rpc.epoch < self.epoch:
            self.sim.logger.info(
                "{} doesn't accept append entries in epoch {} for epoch {}".format(
                    self, self.epoch, rpc.epoch
                )
            )

            # Send back the request that you made originally.
            return self.send(
                msg.source, AEResponse(
                    self.epoch,
                    {obj: False for obj in rpc.tag.keys()},
                    rpc.tag, Reason.EPOCH
                )
            )

        # Update the view to match the view of the append entries
        # Update the epoch to match the rpc of the append entries
        self.view[msg.source] = set(rpc.tag.keys())
        if self.epoch < rpc.epoch:
            self.epoch = rpc.epoch

        # Now for each object in the RPC, perform Raft-like append entries.
        # The success tracking is a complete tracking for all objects, will
        # return false even if we need to update the log for only one thing.
        # We will reply back with a state object that has per-object details.
        success = defaultdict(bool)
        state   = defaultdict(LogState)

        for obj, prev in rpc.tag.items():
            entries = rpc.entries[obj]
            objlog  = self.log[obj]

            # If log doesn't contain an entry at prev index matching epoch.
            if objlog.lastApplied < prev.index or objlog[prev.index].term != prev.epoch:

                # Perform the logging of this state failure
                if objlog.lastApplied < prev.index:
                    self.sim.logger.info(
                        "{} doesn't accept append to {} index {} where last applied is {}".format(
                            self, obj, prev.index, objlog.lastApplied
                        )
                    )
                else:
                    self.sim.logger.info(
                        "{} doesn't accept append to {} due to epoch mismatch: {} vs {}".format(
                            self, obj, prev.epoch, objlog[prev.index].term
                        )
                    )

                # Mark that there is a problem and continue
                # NOTE(review): the state here uses objlog.lastCommit while
                # get_log_state and send_append_entries use commitIndex for
                # the same LogState slot - confirm which one is intended.
                success[obj] = False
                state[obj] = LogState(objlog.lastApplied, objlog.lastTerm, objlog.lastCommit)
                continue

            # At this point the entries are accepted because of continue statements
            if entries:
                if objlog.lastApplied >= prev.index:
                    # If existing entry conflicts with a new one (same index, different epochs)
                    # Delete the existing entry and all that follow it.
                    # NOTE(review): the guard above already continues when
                    # this term differs from prev.epoch, so this truncate
                    # looks unreachable - confirm.
                    if objlog[prev.index].term != prev.epoch:
                        objlog.truncate(prev.index)

                if objlog.lastApplied > prev.index:
                    # Better look into what's happening here!
                    raise TagRPCException(
                        "{} is possibly receiving duplicate append entries".format(self)
                    )

                # Append any new entries not already in the log.
                for entry in entries:
                    # Add the entry/epoch to the log
                    objlog.append(*entry)

                    # Update the versions to compute visibilities
                    entry[0].update(self)

                # Log the last write from the append entries
                self.sim.logger.debug(
                    "appending {} entries to {} log on {} (term {}, commit {})".format(
                        len(entries), obj, self, objlog.lastTerm, objlog.commitIndex
                    )
                )

            # Update the commit index and save the state of the object.
            # The commit index never moves past the last applied entry.
            if prev.commit > objlog.commitIndex:
                objlog.commitIndex = min(prev.commit, objlog.lastApplied)

            success[obj] = True
            state[obj] = LogState(objlog.lastApplied, objlog.lastTerm, objlog.lastCommit)

        # Return the response back to the owner
        reason = Reason.OK if all(success.values()) else Reason.LOG
        return self.send(
            msg.source, AEResponse(self.epoch, success, state, reason)
        )

    def on_ae_response_rpc(self, msg):
        """
        Handles acknowledgment of append entries messages.

        As OWNER: for every object in the response, a success moves the
        follower's nextIndex/matchIndex to the index reported in rpc.tag,
        while a failure adopts a later epoch and schedules a retry (also
        stepping nextIndex back by one when the reason is Reason.LOG). Then,
        for every object in rpc.tag, the highest index above the commit
        index that passes an Election over matchIndex and whose entry term
        equals the current epoch is committed. Finally append entries are
        re-sent to the source if a retry was scheduled.

        As TAGGING: a response from a later epoch adopts that epoch and
        re-sends the tag request. As READY: the response is ignored.

        Raises TagRPCException for an unknown failure reason or state.
        """
        rpc = msg.value
        retry = False

        if self.state == State.OWNER:

            # Update state of followers in the tag group
            for obj, success in rpc.success.items():
                if success:
                    self.nextIndex[msg.source][obj] = rpc.tag[obj].index + 1
                    self.matchIndex[msg.source][obj] = rpc.tag[obj].index

                else:
                    # If the epoch is not the same, update accordingly.
                    if rpc.epoch > self.epoch:
                        self.epoch = rpc.epoch

                    # If the failure was because of the epoch, simply retry.
                    if rpc.reason == Reason.EPOCH:
                        retry = True

                    # Otherwise decrement the next index and retry
                    elif rpc.reason == Reason.LOG:
                        self.nextIndex[msg.source][obj] -= 1
                        retry = True

                    else:
                        raise TagRPCException(
                            "Unknown append entries failure reason: {}".format(rpc.reason)
                        )

            # Determine if we can commit the entry
            for obj, state in rpc.tag.items():
                log = self.log[obj]
                # Walk down from the newest entry so that the first index
                # that passes is the highest one that can be committed.
                for n in xrange(log.lastApplied, log.commitIndex, -1):
                    commit = Election(self.matchIndex.keys())
                    for node, objs in self.matchIndex.items():
                        match = objs[obj]
                        commit.vote(node, match >= n)

                    # Only entries from the current epoch are committed here.
                    if commit.has_passed() and log[n].term == self.epoch:
                        # Commit all versions from the last log to now.
                        for idx in xrange(log.commitIndex, n+1):
                            if not log[idx].version: continue
                            log[idx].version.update(self, commit=True)

                        # Set the commit index and break
                        log.commitIndex = n
                        break

            # If retry, send append entries back to the source.
            if retry: self.send_append_entries(msg.source)


        elif self.state == State.TAGGING:
            # Determine if we need to retry the tagging again.
            if rpc.epoch > self.epoch:
                # Retry the tag request
                self.epoch = rpc.epoch
                self.send_tag_request(self.tag)

                self.sim.logger.info(
                    "{} retrying tag request for {}".format(self, self.tag)
                )

                return

        elif self.state == State.READY:
            # Ignore AE messages if we're not an owner anymore.
            return

        else:
            raise TagRPCException(
                "Response in unknown state: '{}'".format(self.state)
            )

    def on_remote_access(self, msg):
        """
        Handles a remote access sent by another replica. If this replica
        owns the object the access is run through read or write and a
        successful AccessResponse is sent back; otherwise the response
        reports failure without touching the access.
        """
        access = msg.value.access

        if self.owns(access.name):
            # Dispatch on the access type and run it against this replica.
            dispatch = {
                'read': self.read,
                'write': self.write,
            }
            handler = dispatch[access.type]
            handler(access)
            granted = True
        else:
            # We do not own the object, so the access is refused.
            granted = False

        response = AccessResponse(self.epoch, granted, access)
        return self.send(msg.source, response)

    def on_access_response_rpc(self, msg):
        """
        Handles responses to remote accesses.
        """
        rpc = msg.value
        if rpc.success:
            rpc.access.complete()
Example #2
0
class FloatedRaftReplica(RaftReplica):

    def __init__(self, simulation, **kwargs):
        """
        Initializes the Raft replica and then the anti-entropy state. The
        gossip delay can be set with the ``anti_entropy_delay`` keyword.
        """
        # The base replica is initialized first.
        super(FloatedRaftReplica, self).__init__(simulation, **kwargs)

        # Anti-entropy: delay between gossips, its timer, and the cache of
        # versions waiting to be gossiped.
        self.ae_cache = []
        self.ae_timer = None
        self.ae_delay = kwargs.get('anti_entropy_delay', ANTI_ENTROPY_DELAY)

    @memoized
    def locations(self):
        """
        The set of locations of every neighbor in this replica's
        consistency group.
        """
        peers = self.neighbors(self.consistency)
        return {peer.location for peer in peers}

    def quorum(self):
        """
        Returns only nodes in the same location to do Raft consensus with.
        """

        # Filter only connections that are in the same consistency group
        for node in self.neighbors(self.consistency):
            if node.location == self.location:
                yield node

        # Don't forget to yield self!
        yield self

    def remotes(self, location=None):
        """
        Returns only nodes that are not in the same location to float writes
        to using anti-entropy. This method is only used by the leader.
        Can also specify a specific location to fetch the remotes for. Note
        that specifying your current location will not return nodes.
        """

        # Filter only connections that are in the same consistency group
        for node in self.neighbors(self.consistency):
            if node.location != self.location:
                if location is not None and node.location != location:
                    continue
                yield node

    def gossip(self):
        """
        Sends everything in the anti-entropy cache to one randomly chosen
        node at every other location, then empties the cache and schedules
        the next gossip after the anti-entropy delay.
        """
        for location in self.locations:
            # Our own location is handled by consensus, not by gossip.
            if location == self.location:
                continue

            # Pick one random node at the remote location.
            candidates = list(self.remotes(location))
            target = random.choice(candidates)

            self.sim.logger.debug(
                "{} gossiping {} entries to {}".format(
                    self, len(self.ae_cache), target
                )
            )

            # Each target gets its own Write events for the cached versions.
            entries = tuple(
                Write(version.name, self, version)
                for version in self.ae_cache
            )
            self.send(target, Gossip(entries, len(self.ae_cache), -1))

        # Everything cached has now been sent.
        self.ae_cache = []

        # Arm the timer for the next round of gossip.
        self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
        self.ae_timer.start()

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Runs the parent state change and then manages the anti-entropy
        timer: followers and candidates cancel it, the leader starts it.
        """
        super(FloatedRaftReplica, self).on_state_change()

        if self.state in (State.FOLLOWER, State.CANDIDATE):
            # ae_timer may not exist yet when this runs during super's
            # initializer, hence the defaulted lookup.
            if getattr(self, 'ae_timer', None) is not None:
                self.ae_timer.stop()
                self.ae_timer = None
            return

        if self.state == State.LEADER:
            # Only the leader gossips.
            self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
            self.ae_timer.start()
            return

        if self.state == State.READY:
            # This happens on the call to super, just ignore for now.
            return

        raise SimulationException(
            "Unknown Floating Raft State: {!r} set on {}".format(self.state, self)
        )

    def on_gossip_rpc(self, message):
        """
        Handles a gossip message from another node. Every entry (a Write
        event) has its version flagged as gossiped and is then written
        locally, after which an empty GossipResponse is sent to the sender.
        """
        for access in message.value.entries:
            # Flag the version so it is not cached for gossip again here.
            access.version.gossiped = True
            self.write(access)

        # Should we return with what's in our cache?
        reply = GossipResponse([], 0, True, -1)
        self.send(message.source, reply)

    def on_response_rpc(self, message):
        """
        Just receives the acknowledgment of the response.
        """
        pass

    def on_ae_response_rpc(self, msg):
        """
        Handles an append entries response the way a Raft leader does, and
        additionally caches every newly committed version that has not been
        gossiped so that it can be sent out on the next gossip.

        A candidate steps down to follower when the response term is at
        least its own; a follower ignores the response. Raises
        RaftRPCException in any other state.
        """
        rpc = msg.value

        if self.state == State.LEADER:
            follower = msg.source

            if rpc.success:
                self.nextIndex[follower] = rpc.lastLogIndex + 1
                self.matchIndex[follower] = rpc.lastLogIndex
            else:
                # Step back one entry and try the follower again.
                self.nextIndex[follower] -= 1
                self.send_append_entries(follower)

            # Look for the highest index that can now be committed.
            for n in xrange(self.log.lastApplied, self.log.commitIndex, -1):
                commit = Election(self.matchIndex.keys())
                for node, matched in self.matchIndex.iteritems():
                    commit.vote(node, matched >= n)

                # Only entries from the current term are committed here.
                if not (commit.has_passed() and self.log[n][1] == self.currentTerm):
                    continue

                # Commit all versions from the last log entry to now.
                for idx in xrange(self.log.commitIndex, n+1):
                    version = self.log[idx][0]
                    if version is None:
                        continue

                    # Cache the version to anti-entropy!
                    if not getattr(version, 'gossiped', False):
                        self.ae_cache.append(version)

                    version.update(self, commit=True)

                # Set the commit index and stop searching.
                self.log.commitIndex = n
                break

        elif self.state == State.CANDIDATE:
            # Decide whether or not to step down.
            if rpc.term >= self.currentTerm:
                self.state = State.FOLLOWER
                self.sim.logger.info(
                    "{} has stepped down as candidate".format(self)
                )
                return

        elif self.state == State.FOLLOWER:
            # Ignore AE messages if we are the follower.
            return

        else:
            raise RaftRPCException(
                "Append entries response in unknown state: '{}'".format(self.state)
            )
Example #3
0
class TagReplica(ConsensusReplica):
    def __init__(self, simulation, **kwargs):
        """
        Prepares the tag-specific attributes (timing settings, epoch, the
        per-object logs, the ownership view and the owner bookkeeping), then
        delegates to the parent constructor and puts the replica in READY.

        The optional 'session_timeout' and 'heartbeat_interval' keyword
        arguments override the module defaults.
        """
        ## Timing configuration; neither timer exists yet
        self.session_timeout = kwargs.get('session_timeout', SESSION_TIMEOUT)
        self.heartbeat_interval = kwargs.get(
            'heartbeat_interval', HEARTBEAT_INTERVAL
        )
        self.session = self.heartbeat = None

        ## Tag state: current epoch, one write log per object name, and the
        ## view mapping each owner to the set of names it holds
        self.epoch = 0
        self.log = defaultdict(WriteLog)
        self.view = defaultdict(set)

        ## Owner-only indices, filled in when the replica becomes an owner
        self.nextIndex = self.matchIndex = None

        ## Parent initialization happens after the attributes above exist
        super(TagReplica, self).__init__(simulation, **kwargs)
        self.state = State.READY

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def read(self, name, **kwargs):
        """
        Performs a read against the tag this replica may or may not own.

        The name is converted into an access by the parent class, then:

            - if this replica owns the name, the last version in the local
              log is read and the access is completed (or dropped as empty
              when nothing has been written yet)
            - if a different replica owns the name, the read is dropped
            - if nobody owns the name, the read is retried after one
              heartbeat interval and, on the first attempt only, the tag is
              requested

        TODO: Remote vs Local Reads
        TODO: Read the last commit rather than the last version
        """
        access = super(TagReplica, self).read(name, **kwargs)

        # Only local accesses count their attempts
        if access.is_local_to(self):
            access.attempts += 1

        # Any access keeps the session alive
        self.handle_session()

        # Case 1: we own the tag, so serve the read from the local log
        if self.owns(access.name):
            version = self.log[access.name].lastVersion
            if version is None:
                # Nothing has been written to this object yet
                return access.drop(empty=True)

            access.update(version, completed=True)
            access.log(self)
            return access

        # Case 2: someone else owns the tag, so the read is simply dropped
        if self.find_owner(access.name) is not None:
            self.sim.logger.info(
                "ownership conflict: dropped {} at {}".format(access, self)
            )
            return access.drop()

        # Case 3: the tag is unowned; log, schedule a retry, and acquire
        access.log(self)
        Timer(
            self.env, self.heartbeat_interval, lambda: self.read(access)
        ).start()

        # Only the first attempt requests the tag, and not while a tag
        # request is already in flight
        if access.attempts <= 1 and self.state != State.TAGGING:
            self.acquire(access.name)

        return access

    def write(self, name, **kwargs):
        """
        When a replica performs a write it needs to decide if it can write to
        the tag locally, can acquire a tag for this object, or if it has to do
        something else like drop, wait, or remote write.

        If the access is local:

            - if the replica owns the tag, append and complete
            - if someone else owns the tag, the write is dropped
            - if no one owns the tag, then attempt to acquire it and retry
              the write after one heartbeat interval

        If access is remote:

            - it must already carry a version, otherwise AccessError is raised
            - if we own the tag, then append but do not complete (at local)

        The ``name`` argument may be an object name or an existing access
        (the retry timer below and on_remote_access both pass an access), so
        every lookup goes through ``access.name`` and never ``name`` itself.

        Returns the access (or the result of dropping it).
        """
        # Create the write event using super.
        access = super(TagReplica, self).write(name, **kwargs)

        # Increase the session on access.
        self.handle_session()

        # Determine if the write is local or remote
        if access.is_local_to(self):
            # Record the number of attempts for the access
            access.attempts += 1

            # Fetch the latest version from the log.
            latest = self.log[access.name].lastVersion

            # Perform the write: first version of the object, or a successor
            if latest is None:
                version = namespace(access.name)(self)
            else:
                version = latest.nextv(self)

            # Update the access with the latest version
            access.update(version)

        else:
            # If there is no version, raise an exception
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".
                    format(self))

            # Save the version variable for use below.
            version = access.version

        # Log the access at this replica
        access.log(self)

        # Are we the owner of this tag?
        if self.owns(access.name):
            # Perform the append entries. The log is keyed by object name;
            # indexing the defaultdict with an access object would silently
            # create a separate, unrelated log.
            self.log[access.name].append(version, self.epoch)
            # Update the version to track visibility latency
            version.update(self)

            # Complete the access if it was local
            if access.is_local_to(self): access.complete()

            # Now do AppendEntries
            # Also interrupt the heartbeat since we just sent AppendEntries
            if not settings.simulation.aggregate_writes:
                self.send_append_entries()
                if self.heartbeat: self.heartbeat.stop()

            return access

        # Is there a different owner for the tag?
        owner = self.find_owner(access.name)
        if owner is not None:
            # Right now just drop the write on its face.
            self.sim.logger.info("ownership conflict: dropped {} at {}".format(
                access, self))
            return access.drop()

        # We're going to acquire the tag!
        # We're going to have some write latency, retry the write.
        Timer(self.env, self.heartbeat_interval,
              lambda: self.write(access)).start()

        # Request the ownership of the tag
        self.acquire(access.name)

        return access

    def run(self):
        """
        Main process loop: wakes up once per heartbeat interval. While the
        replica is an owner a fresh heartbeat timer (which fires
        on_heartbeat_timeout) is started each round; otherwise the replica
        just waits out the interval.
        """
        while True:
            if not self.state == State.OWNER:
                # Nothing to broadcast, quiesce for one interval
                yield self.env.timeout(self.heartbeat_interval)
                continue

            # Owners keep a handle on the timer so it can be interrupted
            self.heartbeat = Timer(
                self.env, self.heartbeat_interval, self.on_heartbeat_timeout
            )
            yield self.heartbeat.start()

    ######################################################################
    ## Helper Methods
    ######################################################################

    def owns(self, name):
        """
        Returns True if the name is in the current view for that owner.
        """
        return name in self.view[self]

    def find_owner(self, name):
        """
        Looks up the owner of the name in the current view.
        Returns None if there is no owner fo the tag.
        """
        for owner, tag in self.view.items():
            if name in tag:
                return owner
        return None

    def acquire(self, tag):
        """
        Sends out the acquire tag RPC
        """
        # Construct the tag to send out
        if not isinstance(tag, (set, frozenset)):
            tag = frozenset([tag])

        # Make sure to request the tag we already have
        tag = frozenset(self.view[self] | tag)

        # Request tag with all current tags
        self.send_tag_request(tag)

        # Log the tag acquisition
        self.sim.logger.info("{} is atempting to acquire tag {}".format(
            self, self.tag))

    def release(self, tag=None):
        """
        Sends out the release tag RPC
        """
        # Release all currently held tags
        if tag is None: tag = self.view[self]

        # Construct the tag to send out (if specified)
        if not isinstance(tag, (set, frozenset)):
            tag = frozenset([tag])

        # Request the difference of the tags we already have
        tag = frozenset(self.view[self] - tag)

        # Request tag with all current tags
        self.send_tag_request(tag)

        # Log the tag release
        self.sim.logger.info("{} is atempting to release tag {}".format(
            self, tag))

    def handle_session(self):
        """
        Starts a session timer if one isn't running, otherwise resets the
        currently running session timer on an additional access.
        """
        if not self.session:
            self.session = Timer(
                self.env, self.session_timeout,
                partial(self.on_session_timeout, self.env.now))
        else:
            self.session = self.session.reset()

    def get_log_state(self, tag=None):
        """
        Constructs a log state object for append entries responses, either
        for the current tag or simply the current view.
        """
        if tag is None:
            tag = [obj for view in self.view.values() for obj in view]

        return {
            obj: LogState(self.log[obj].lastApplied, self.log[obj].lastTerm,
                          self.log[obj].commitIndex)
            for obj in tag
        }

    def send_tag_request(self, tag):
        """
        Switches to the TAGGING state, remembers the requested tag, and
        sends a RequestTag RPC to every neighbor. The RPC carries the whole
        view keyed by owner id, with this replica's entry replaced by the
        requested tag.
        """
        # The state is changed before the tag is stored and the RPC is built
        self.state = State.TAGGING
        self.tag = tag

        # Describe the full view by owner id, overriding our own entry
        tagset = {}
        for owner, names in self.view.items():
            tagset[owner.id] = names
        tagset[self.id] = self.tag

        # One RPC object is shared by all of the neighbors
        request = RequestTag(self.epoch, tagset, self)
        for neighbor in self.neighbors():
            self.send(neighbor, request)

    def send_append_entries(self, target=None):
        """
        Sends an AppendEntries message to every node tracked in nextIndex,
        or only to target when one is given. Does nothing unless this
        replica is currently an owner.

        Note: fails silently if target is not tracked in nextIndex.
        """
        if not self.state == State.OWNER:
            return

        for node, objs in self.nextIndex.iteritems():
            # Skip everyone but the target when a target was supplied
            if target is not None:
                if node != target:
                    continue

            # Per object: the entries to ship (none means heartbeat) and the
            # log state the entries follow on from
            entries = defaultdict(list)
            tag = defaultdict(LogState)

            for obj, nidx in objs.items():
                objlog = self.log[obj]

                # Ship everything from the follower's next index onward
                if objlog.lastApplied >= nidx:
                    entries[obj] = objlog[nidx:]

                # The entry just before next index anchors the append
                before = nidx - 1
                tag[obj] = LogState(
                    before, objlog[before].term, objlog.commitIndex
                )

            self.send(node, AppendEntries(self.epoch, self.id, tag, entries))

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Adjusts the replica's bookkeeping to match the state it is now in.

            - READY: clears the votes, the requested tag and the owner
              indices, and stops the heartbeat timer if there is one
            - TAGGING: bumps the epoch, opens an election over the quorum
              with a vote for self, and stops the heartbeat timer if any
            - OWNER: builds nextIndex and matchIndex for every neighbor and
              every object in this replica's entry of the view

        Raises SimulationException for any other state.
        """

        # Do state specific tag modifications
        if self.state == State.READY:
            self.votes = None
            self.tag = None

            # Remove owner state
            self.nextIndex = None
            self.matchIndex = None

            # Also interrupt the heartbeat
            if self.heartbeat: self.heartbeat.stop()

        elif self.state == State.TAGGING:
            # Convert to tag acquisition/release
            self.epoch += 1

            # Create election and vote for self
            self.votes = Election([node.id for node in self.quorum()])
            self.votes.vote(self.id)

            # Also interrupt the heartbeat
            if self.heartbeat: self.heartbeat.stop()

        elif self.state == State.OWNER:

            # Next index starts one past the end of each local object log
            self.nextIndex = {
                node: {
                    obj: self.log[obj].lastApplied + 1
                    for obj in self.view[self]
                }
                for node in self.neighbors()
            }

            # Nothing is known to be replicated on any neighbor yet
            self.matchIndex = {
                node: {obj: 0
                       for obj in self.view[self]}
                for node in self.neighbors()
            }

        else:
            # Report the offending state; it lives on the instance, there is
            # no local variable holding it.
            raise SimulationException(
                "Unknown Tag Replica State: {!r} set on {}".format(
                    self.state, self))

    def on_heartbeat_timeout(self):
        """
        Heartbeat timer callback: an owner sends append entries (a
        heartbeat or the aggregated writes); any other state does nothing.
        """
        if self.state == State.OWNER:
            self.send_append_entries()

    def on_session_timeout(self, started):
        """
        If the session times out then go ahead and release the tag.
        """
        duration = self.env.now - started

        self.sim.logger.info("session on {} terminated at {} ({} ms)".format(
            self.id, self.env.now, duration))

        self.sim.results.update('session length', (self.id, duration))

        self.session = None
        self.release()

    def on_request_tag_rpc(self, msg):
        """
        Votes on another replica's tag request and sends the vote back.

        The request is refused when its epoch is behind the local epoch, or
        when any name in the requested tagset is owned, in the local view,
        by a replica other than the one the request assigns it to.
        """
        rpc = msg.value

        # The requested epoch must be greater than or equal to the local one
        accept = not rpc.epoch < self.epoch

        # No name may already belong to a different owner in our view
        if accept:
            for candidate, tagset in rpc.tag.items():
                for tag in tagset:
                    owner = self.find_owner(tag)
                    if owner is not None and owner.id != candidate:
                        accept = False
                        break

                # Stop at the first conflict
                if not accept:
                    break

        # Log the vote decision
        verdict = "accepted" if accept else "did not accept"
        requester = rpc.candidate.id
        self.sim.logger.info(
            "{} {} tag [{}] for {}".format(
                self, verdict, ",".join(rpc.tag[requester]), requester
            )
        )

        # Send the vote response back to the tag requester
        return self.send(msg.source, TagResponse(self.epoch, accept))

    def on_tag_response_rpc(self, msg):
        """
        Tallies a vote on this replica's outstanding tag request.

        Responses are ignored in the READY and OWNER states and raise
        TagRPCException in any state other than TAGGING. While tagging, a
        response from a later epoch restarts the request at that epoch;
        otherwise the vote is counted and, once the election has passed,
        the replica becomes OWNER of the requested tag (or READY when the
        requested tag is empty) and sends append entries.
        """
        rpc = msg.value

        if not self.state == State.TAGGING:
            # Votes that arrive after we changed state are stale
            if self.state in (State.READY, State.OWNER):
                return

            raise TagRPCException(
                "Tag request response in unknown state: '{}'".format(
                    self.state))

        # A voter in a later epoch means the request has to be redone
        if rpc.epoch > self.epoch:
            self.epoch = rpc.epoch
            self.send_tag_request(self.tag)

            self.sim.logger.info(
                "{} retrying tag request for {}".format(self, self.tag)
            )
            return

        # Count the vote and wait until the election has passed
        self.votes.vote(msg.source.id, rpc.accept)
        if not self.votes.has_passed():
            return

        # A non-empty tag makes us the owner, an empty one sends us to READY
        if self.tag:
            self.state = State.OWNER
            self.view[self] = set(self.tag)
        else:
            self.state = State.READY

        # Send out the ownership change append entries
        self.send_append_entries()

        # Log the new tag owner
        self.sim.logger.info(
            "{} tag goes to: {}".format(self, self.view[self])
        )

        # Record tag length over time
        tag_size = (self.id, self.env.now, len(self.view[self]))
        self.sim.results.update('tag size', tag_size)

    def on_append_entries_rpc(self, msg):
        """
        Handles an append entries message covering one or more objects.

        A message from an epoch older than the local one is rejected
        outright: every object is reported as failed, the sender's own tag
        state is echoed back, and the reason is Reason.EPOCH.

        Otherwise the sender's tag becomes its entry in the local view, the
        local epoch is raised to the message epoch if it was behind, and
        each object is checked and appended independently. The reply is an
        AEResponse with a per-object success flag, a per-object LogState,
        and Reason.OK only when every object succeeded (Reason.LOG
        otherwise).

        Raises TagRPCException when entries arrive for an object whose log
        already extends past the previous index they are anchored to.
        """
        rpc = msg.value

        # reply false if the epoch < current epoch
        if rpc.epoch < self.epoch:
            self.sim.logger.info(
                "{} doesn't accept append entries in epoch {} for epoch {}".
                format(self, self.epoch, rpc.epoch))

            # Send back the request that you made originally.
            return self.send(
                msg.source,
                AEResponse(self.epoch, {obj: False
                                        for obj in rpc.tag.keys()}, rpc.tag,
                           Reason.EPOCH))

        # Update the view to match the view of the append entries
        # Update the epoch to match the rpc of the append entries
        self.view[msg.source] = set(rpc.tag.keys())
        if self.epoch < rpc.epoch:
            self.epoch = rpc.epoch

        # Now for each object in the RPC, perform Raft-like append entries.
        # The success tracking is a complete tracking for all objects, will
        # return false even if we need to update the log for only one thing.
        # We will reply back with a state object that has per-object details.
        success = defaultdict(bool)
        state = defaultdict(LogState)

        for obj, prev in rpc.tag.items():
            # prev describes the sender's log just before the new entries:
            # its index, its epoch and the sender's commit index.
            entries = rpc.entries[obj]
            objlog = self.log[obj]

            # If log doesn't contain an entry at prev index matching epoch.
            if objlog.lastApplied < prev.index or objlog[
                    prev.index].term != prev.epoch:

                # Perform the logging of this state failure
                if objlog.lastApplied < prev.index:
                    self.sim.logger.info(
                        "{} doesn't accept append to {} index {} where last applied is {}"
                        .format(self, obj, prev.index, objlog.lastApplied))
                else:
                    self.sim.logger.info(
                        "{} doesn't accept append to {} due to epoch mismatch: {} vs {}"
                        .format(self, obj, prev.epoch,
                                objlog[prev.index].term))

                # Mark that there is a problem and continue
                # NOTE(review): the third LogState field here is lastCommit,
                # while get_log_state passes commitIndex -- confirm the two
                # are meant to be interchangeable.
                success[obj] = False
                state[obj] = LogState(objlog.lastApplied, objlog.lastTerm,
                                      objlog.lastCommit)
                continue

            # At this point the entries are accepted because of continue statements
            if entries:
                if objlog.lastApplied >= prev.index:
                    # If existing entry conflicts with a new one (same index, different epochs)
                    # Delete the existing entry and all that follow it.
                    # NOTE(review): a term mismatch at prev.index already hit
                    # the `continue` above, so this truncate looks
                    # unreachable unless indexing the log is not repeatable
                    # -- confirm.
                    if objlog[prev.index].term != prev.epoch:
                        objlog.truncate(prev.index)

                # The local log is longer than the point the entries attach to
                if objlog.lastApplied > prev.index:
                    # Better look into what's happening here!
                    raise TagRPCException(
                        "{} is possibly receiving duplicate append entries".
                        format(self))

                # Append any new entries not already in the log.
                for entry in entries:
                    # Add the entry/epoch to the log
                    objlog.append(*entry)

                    # Update the versions to compute visibilities
                    # (the first element of an entry is its version)
                    entry[0].update(self)

                # Log the last write from the append entries
                self.sim.logger.debug(
                    "appending {} entries to {} log on {} (term {}, commit {})"
                    .format(len(entries), obj, self, objlog.lastTerm,
                            objlog.commitIndex))

            # Update the commit index and save the state of the object.
            # The commit index never moves past what is actually in the log.
            if prev.commit > objlog.commitIndex:
                objlog.commitIndex = min(prev.commit, objlog.lastApplied)

            success[obj] = True
            state[obj] = LogState(objlog.lastApplied, objlog.lastTerm,
                                  objlog.lastCommit)

        # Return the response back to the owner
        # (all() over no objects is True, so an empty tag replies Reason.OK)
        reason = Reason.OK if all(success.values()) else Reason.LOG
        return self.send(msg.source,
                         AEResponse(self.epoch, success, state, reason))

    def on_ae_response_rpc(self, msg):
        """
        Handles acknowledgment of append entries messages.

        As OWNER: records the follower's progress per object (or prepares a
        retry on failure), then tries to advance the commit index of every
        object named in the response, and finally resends append entries to
        the responder if any object needs a retry.

        As TAGGING: a response from a later epoch restarts the tag request
        at that epoch. As READY: the response is ignored.

        Raises TagRPCException for an unknown failure reason or when called
        in any other state.
        """
        rpc = msg.value
        retry = False

        if self.state == State.OWNER:

            # Update state of followers in the tag group
            for obj, success in rpc.success.items():
                if success:
                    # The follower's log now matches ours up to the index it
                    # reported for this object.
                    self.nextIndex[msg.source][obj] = rpc.tag[obj].index + 1
                    self.matchIndex[msg.source][obj] = rpc.tag[obj].index

                else:
                    # If the epoch is not the same, update accordingly.
                    if rpc.epoch > self.epoch:
                        self.epoch = rpc.epoch

                    # If the failure was because of the epoch, simply retry.
                    if rpc.reason == Reason.EPOCH:
                        retry = True

                    # Otherwise decrement the next index and to retry
                    elif rpc.reason == Reason.LOG:
                        self.nextIndex[msg.source][obj] -= 1
                        retry = True

                    else:
                        raise TagRPCException(
                            "Unknown append entries failure reason: {}".format(
                                rpc.reason))

            # Determine if we can commit the entry
            # (only the object names from rpc.tag are used, not the states)
            for obj, state in rpc.tag.items():
                log = self.log[obj]
                # Walk down from the newest entry to just above the commit
                # index, stopping at the first index that can be committed.
                for n in xrange(log.lastApplied, log.commitIndex, -1):
                    # One vote per tracked node: has it matched index n?
                    commit = Election(self.matchIndex.keys())
                    for node, objs in self.matchIndex.items():
                        match = objs[obj]
                        commit.vote(node, match >= n)

                    # Only entries written in the current epoch are committed
                    if commit.has_passed() and log[n].term == self.epoch:
                        # Commit all versions from the last log to now.
                        for idx in xrange(log.commitIndex, n + 1):
                            # Entries without a version are skipped
                            if not log[idx].version: continue
                            log[idx].version.update(self, commit=True)

                        # Set the commit index and break
                        log.commitIndex = n
                        break

            # If retry, send append entries back to the source.
            if retry: self.send_append_entries(msg.source)

        elif self.state == State.TAGGING:
            # Determine if we need to retry the tagging again.
            if rpc.epoch > self.epoch:
                # Retry the tag request
                self.epoch = rpc.epoch
                self.send_tag_request(self.tag)

                self.sim.logger.info("{} retrying tag request for {}".format(
                    self, self.tag))

                return

        elif self.state == State.READY:
            # Ignore AE messages if we're not an owner anymore.
            return

        else:
            raise TagRPCException("Response in unknown state: '{}'".format(
                self.state))

    def on_remote_access(self, msg):
        """
        Serves an access forwarded by another replica. When this replica
        owns the accessed name, the access is run through read or write
        (chosen by the access type) and a successful AccessResponse is sent
        back; otherwise an unsuccessful AccessResponse is sent.
        """
        access = msg.value.access

        if self.owns(access.name):
            # Dispatch on the kind of access that was forwarded
            handlers = {
                'read': self.read,
                'write': self.write,
            }
            handlers[access.type](access)
            served = True
        else:
            # We cannot serve names that we do not own
            served = False

        return self.send(
            msg.source, AccessResponse(self.epoch, served, access)
        )

    def on_access_response_rpc(self, msg):
        """
        Handles responses to remote accesses.
        """
        rpc = msg.value
        if rpc.success:
            rpc.access.complete()
Example #4
0
class RaftReplica(ConsensusReplica):
    def __init__(self, simulation, **kwargs):
        """
        Initializes the parent replica first, then the Raft state (the
        replica starts as a FOLLOWER in term 0 with an empty log), the read
        and write policies, the election and heartbeat timers, and the
        leader-only indices.

        Recognized keyword arguments: 'read_policy', 'aggregate_writes',
        'election_timeout' and 'heartbeat_interval'.
        """
        super(RaftReplica, self).__init__(simulation, **kwargs)

        ## Raft state
        self.state = State.FOLLOWER
        self.currentTerm = 0
        self.votedFor = None
        self.log = MultiObjectWriteLog()
        self.cache = {}

        ## Policies
        policy = kwargs.get("read_policy", READ_POLICY)
        self.read_policy = ReadPolicy.get(policy)
        self.aggregate_writes = kwargs.get("aggregate_writes", AGGREGATE_WRITES)

        ## Timers: the election timeout and the leader's heartbeat
        election_timeout = kwargs.get("election_timeout", ELECTION_TIMEOUT)
        heartbeat_interval = kwargs.get("heartbeat_interval", HEARTBEAT_INTERVAL)

        self.timeout = ElectionTimer.fromReplica(self, election_timeout)
        self.heartbeat = Timer(
            self.env, heartbeat_interval, self.on_heartbeat_timeout
        )

        ## Leader state (unset until this replica leads)
        self.nextIndex = self.matchIndex = None

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def recv(self, event):
        """
        Applies the message-wide term check before the parent class handles
        the event: when the received RPC carries a term newer than the
        current term, this replica becomes a follower and adopts that term.
        """
        incoming = event.value.value

        # A newer term always demotes us to follower in that term
        if incoming.term > self.currentTerm:
            self.state = State.FOLLOWER
            self.currentTerm = incoming.term

        # The parent does the recording and dispatch
        return super(RaftReplica, self).recv(event)

    def read(self, name, **kwargs):
        """
        Performs a local read of the version selected by the replica's read
        policy. The read is dropped as empty when the policy yields no
        version; otherwise it is completed with that version and logged.

        NOTE: Formerly this always read the committed version rather than
        the latest one; the choice now belongs to the read policy.
        """
        access = super(RaftReplica, self).read(name, **kwargs)

        # Only local accesses count their attempts
        if access.is_local_to(self):
            access.attempts += 1

        version = self.read_via_policy(access.name)

        # No version means there is nothing to read yet
        if version is None:
            return access.drop(empty=True)

        # The read is served locally, so it completes right away
        access.update(version, completed=True)
        access.log(self)

        return access

    def write(self, name, **kwargs):
        """
        Handles a write that may start on any replica, leader or follower.

        A local write gets a new version from the write policy; a remote
        write must already carry one (AccessError is raised otherwise).

        A replica that is not the leader caches the version under the
        access name and forwards the write to the leader with a
        RemoteWrite, returning the result of that send. A remote write that
        lands on a non-leader is also logged.

        The leader appends the access to its log (completing it only when
        the write is local), updates the version to track visibility, and,
        unless writes are aggregated, sends append entries immediately and
        stops the heartbeat timer.
        """
        access = super(RaftReplica, self).write(name, **kwargs)
        local = access.is_local_to(self)

        if local:
            # Count the attempt and create the new version by policy
            access.attempts += 1
            version = self.write_via_policy(access.name)
            access.update(version)

            # Log the access from this particular replica.
            access.log(self)

        else:
            # Log the access from this particular replica.
            access.log(self)

            # A forwarded write has to bring its version along
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".format(self)
                )
            version = access.version

        if not self.state == State.LEADER:
            if not local:
                # Remote write occurred from client to a follower
                self.sim.logger.info(
                    "remote write on follower node: {}".format(self)
                )

            # Keep a cached copy of the version and hand the write to the leader
            self.cache[access.name] = version
            return self.send_remote_write(access)

        # Leader: append now, completing only writes that started here
        self.append_via_policy(access, complete=bool(local))

        # At this point we've dealt with local vs. remote, we should be the leader
        assert self.state == State.LEADER

        # Update the version to track visibility latency
        forte = bool(settings.simulation.forte_on_append)
        version.update(self, forte=forte)

        # Send AppendEntries right away unless writes are aggregated, and
        # interrupt the heartbeat since it would be redundant
        if not self.aggregate_writes:
            self.send_append_entries()
            self.heartbeat.stop()

        return access

    def run(self):
        """
        Main process loop: a leader waits on its heartbeat timer, a
        follower or candidate waits on its election timeout. Any other
        state raises SimulationException.
        """
        waiting_states = {State.FOLLOWER, State.CANDIDATE}

        while True:
            if self.state == State.LEADER:
                yield self.heartbeat.start()

            elif self.state in waiting_states:
                yield self.timeout.start()

            else:
                raise SimulationException(
                    "Unknown Raft State: {!r} on {}".format(self.state, self)
                )

    ######################################################################
    ## Helper Methods
    ######################################################################

    def send_append_entries(self, target=None):
        """
        Sends an AppendEntries message to every node tracked in nextIndex,
        or only to `target` when one is supplied. Does nothing unless this
        replica is currently the leader.

        The message carries every log entry from the node's next index
        onward, or an empty list (a plain heartbeat) when the log holds
        nothing at or past that index.

        Note: a target that is not a key of nextIndex is silently skipped.
        """
        # Only the leader replicates its log.
        if self.state != State.LEADER:
            return

        for follower, next_idx in self.nextIndex.iteritems():
            # When a target is given, skip every other node.
            if target is not None and follower != target:
                continue

            # Entries the follower is missing; empty means heartbeat only.
            if self.log.lastApplied >= next_idx:
                pending = self.log[next_idx:]
            else:
                pending = []

            # The entry immediately preceding the ones being sent.
            prev_idx = next_idx - 1
            prev_term = self.log[prev_idx].term

            message = AppendEntries(self.currentTerm, self.id, prev_idx, prev_term, pending, self.log.commitIndex)
            self.send(follower, message)

    def send_remote_write(self, access):
        """
        Forwards a write access to the current leader as a RemoteWrite.

        Returns the access after sending it. If no leader can be found the
        drop is logged and the result of access.drop() is returned instead.
        """
        leader = self.get_leader_node()

        if leader:
            # Hand the write to the leader, tagged with our current term.
            self.send(leader, RemoteWrite(self.currentTerm, access))
            return access

        # No leader is known, so the write cannot go anywhere.
        self.sim.logger.info("no leader: dropped write at {}".format(self))
        return access.drop()

    def get_leader_node(self):
        """
        Scans the quorum for a node whose state is LEADER.

        Returns that node, or None when the quorum has no leader. Raises a
        SimulationException when more than one node claims leadership.
        """
        leaders = []
        for node in self.quorum():
            if node.state == State.LEADER:
                leaders.append(node)

        count = len(leaders)
        if count < 1:
            return None
        if count > 1:
            raise SimulationException("MutipleLeaders?!")
        return leaders[0]

    def read_via_policy(self, name):
        """
        Looks up the version of `name` that the replica's read policy says
        should be visible:

            - COMMIT: the latest committed version in the log; the cache is
              never consulted.
            - LATEST: the latest version in the log (committed or not),
              unless the cache holds a greater version for the name.

        NOTE: under LATEST the cache is only consulted when the log already
        has a version for the name; otherwise None is returned even if a
        cached version exists.

        Raises a SimulationException for any other read policy.
        """
        policy = self.read_policy

        # Read committed: straight from the log.
        if policy == ReadPolicy.COMMIT:
            return self.log.get_latest_commit(name)

        # Read latest: newest log version, possibly superseded by the cache.
        if policy == ReadPolicy.LATEST:
            newest = self.log.get_latest_version(name)

            if name in self.cache and newest is not None:
                cached = self.cache[name]
                if cached > newest:
                    return cached

            return newest

        # Neither known policy matched.
        raise SimulationException("Unknown read policy!")

    def write_via_policy(self, name):
        """
        Produces the version object for a new write to `name`.

        The parent version is whatever read_via_policy returns for the name
        (so it follows the replica's read policy). When a parent exists the
        new version is parent.nextv(self); when there is none, a brand new
        version is created from the name's namespace.
        """
        parent = self.read_via_policy(name)

        if parent is not None:
            # Increment from the version visible under the read policy.
            return parent.nextv(self)

        # Nothing has been written to this name yet: start a new history.
        return namespace(name)(self)

    def append_via_policy(self, access, complete=False):
        """
        Gatekeeper for appending a write to the leader's log. The default
        policy admits everything: the access's version is appended under
        the current term, and the access is completed when `complete` is
        truthy. Returns True once the entry has been appended.

        NOTE: leader-only. Followers receive entries via AppendEntries, so
        calling this on a non-leader raises a RaftRPCException.

        NOTE: subclasses (as in Federated) can replace the admission policy.
        """
        if self.state == State.LEADER:
            # Default policy: append anything.
            self.log.append(access.version, self.currentTerm)

            # Only complete when the caller asked for it.
            if complete:
                access.complete()

            # Signal a successful append.
            return True

        raise RaftRPCException("Append via policies called on a follower replica!")

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Resets the role-specific bookkeeping after the replica's state has
        changed:

            - FOLLOWER / CANDIDATE: clear the recorded vote and drop the
              leader-only nextIndex / matchIndex tables.
            - LEADER: build nextIndex (last applied index + 1) and
              matchIndex (0) for every other node in the quorum.
            - READY: nothing to do.

        Raises a SimulationException for any other state.
        """
        if self.state in (State.FOLLOWER, State.CANDIDATE):
            # CANDIDATE is handled here together with FOLLOWER; a separate
            # CANDIDATE branch after this test could never be reached.
            self.votedFor = None
            self.nextIndex = None
            self.matchIndex = None
        elif self.state == State.LEADER:
            # Track replication progress for every node except ourselves.
            self.nextIndex = {node: self.log.lastApplied + 1 for node in self.quorum() if node != self}
            self.matchIndex = {node: 0 for node in self.quorum() if node != self}
        elif self.state == State.READY:
            # This happens on the call to super, just ignore for now.
            pass
        else:
            raise SimulationException("Unknown Raft State: {!r} set on {}".format(self.state, self))

    def on_heartbeat_timeout(self):
        """
        Fires when the heartbeat timer expires. A leader responds by sending
        AppendEntries (a heartbeat, or any writes that have accumulated);
        any other state ignores the event.
        """
        if self.state == State.LEADER:
            # Send heartbeat or aggregated writes
            self.send_append_entries()

    def on_election_timeout(self):
        """
        Fires when the election timer expires: the replica becomes a
        candidate, starts a new term, votes for itself and asks every other
        node in the quorum for its vote.
        """
        # Become a candidate before touching the term or the vote; the
        # vote for self is recorded only after the state assignment.
        self.state = State.CANDIDATE
        self.currentTerm += 1

        # Open an election over the whole quorum and cast our own vote.
        self.votes = Election([member.id for member in self.quorum()])
        self.votes.vote(self.id)
        self.votedFor = self.id

        # One request, shared by every recipient.
        request = RequestVote(self.currentTerm, self.id, self.log.lastApplied, self.log.lastTerm)

        for peer in self.quorum():
            # Everyone but ourselves gets asked.
            if not (peer == self):
                self.send(peer, request)

        # Log the newly formed candidacy
        self.sim.logger.info("{} is now a leader candidate".format(self))

    def on_request_vote_rpc(self, msg):
        """
        Handles a RequestVote message and replies with a VoteResponse.

        The vote is granted only when all of the following hold, checked in
        this order:

            1. the request's term is not behind our current term;
            2. we have not voted yet, or already voted for this candidate;
            3. the log reports the candidate as being at least as up to
               date (self.log.as_up_to_date).

        Granting the vote also stops the election timeout and records the
        candidate in votedFor.
        """
        request = msg.value

        granted = (
            request.term >= self.currentTerm
            and (self.votedFor is None or self.votedFor == request.candidateId)
            and self.log.as_up_to_date(request.lastLogTerm, request.lastLogIndex)
        )

        if not granted:
            return self.send(msg.source, VoteResponse(self.currentTerm, False))

        self.sim.logger.info("{} voting for {}".format(self, request.candidateId))

        self.timeout.stop()
        self.votedFor = request.candidateId
        return self.send(msg.source, VoteResponse(self.currentTerm, True))

    def on_vote_response_rpc(self, msg):
        """
        Handles the response to a RequestVote message.

        A candidate records the vote and, once the election has passed,
        becomes the leader, stops its election timeout and immediately
        sends AppendEntries to announce the change. Followers and leaders
        ignore the response. Any other state raises a RaftRPCException.
        """
        response = msg.value
        role = self.state

        if role == State.CANDIDATE:
            # Tally the vote in the running election.
            self.votes.vote(msg.source.id, response.voteGranted)
            if not self.votes.has_passed():
                return

            ## Become the leader
            self.state = State.LEADER
            self.timeout.stop()

            ## Send the leadership change append entries
            self.send_append_entries()

            ## Log the new leader
            self.sim.logger.info("{} has become raft leader".format(self))
            return

        if role in (State.FOLLOWER, State.LEADER):
            # The election is already over for us: nothing to count.
            return

        raise RaftRPCException("Vote response in unknown state: '{}'".format(self.state))

    def on_append_entries_rpc(self, msg):
        """
        Callback for the AppendEntries RPC call.

        Every path replies to the sender with an AEResponse carrying the
        current term, a success flag, and the log's lastApplied and
        lastCommit values. The reply is unsuccessful when the message's
        term is behind ours, or when the log has no entry at prevLogIndex
        whose term equals prevLogTerm. Otherwise any entries are appended,
        the commit index is advanced toward leaderCommit, and a successful
        reply is sent.
        """
        rpc = msg.value

        # Stop the election timeout
        # NOTE(review): this runs before the term check below, so a message
        # that ends up rejected still stops the timer -- confirm intended.
        self.timeout.stop()

        # Reply false if term < current term
        if rpc.term < self.currentTerm:
            # The logged term is our own current term, not the message's.
            self.sim.logger.info("{} doesn't accept write on term {}".format(self, self.currentTerm))
            return self.send(msg.source, AEResponse(self.currentTerm, False, self.log.lastApplied, self.log.lastCommit))

        # Reply false if log doesn't contain an entry at prevLogIndex whose
        # term matches previous log term.
        if self.log.lastApplied < rpc.prevLogIndex or self.log[rpc.prevLogIndex][1] != rpc.prevLogTerm:
            # Same two conditions again, only to pick the log message.
            if self.log.lastApplied < rpc.prevLogIndex:

                self.sim.logger.info(
                    "{} doesn't accept write on index {} where last applied is {}".format(
                        self, rpc.prevLogIndex, self.log.lastApplied
                    )
                )
            else:
                self.sim.logger.info(
                    "{} doesn't accept write for term mismatch {} vs {}".format(
                        self, rpc.prevLogTerm, self.log[rpc.prevLogIndex][1]
                    )
                )

            return self.send(msg.source, AEResponse(self.currentTerm, False, self.log.lastApplied, self.log.lastCommit))

        # At this point AppendEntries RPC is accepted
        if rpc.entries:
            if self.log.lastApplied >= rpc.prevLogIndex:
                # If existing entry conflicts with new one (same index, different terms)
                # Delete the existing entry and all that follow it.
                # NOTE(review): a term mismatch at prevLogIndex already
                # returned above and the log has not changed since, so this
                # truncate looks unreachable -- confirm.
                if self.log[rpc.prevLogIndex][1] != rpc.prevLogTerm:
                    self.log.truncate(rpc.prevLogIndex)

            if self.log.lastApplied > rpc.prevLogIndex:
                # Otherwise this could be a message that is sent again
                # raise RaftRPCException(
                #     "{} is possibly receiving a duplicate append entries!".format(self)
                # )
                # The log already extends past prevLogIndex: warn and reply
                # success without appending any of the entries.
                self.sim.logger.warn("{} is possibly receiving a duplicate append entries!".format(self))
                return self.send(
                    msg.source, AEResponse(self.currentTerm, True, self.log.lastApplied, self.log.lastCommit)
                )

            # Append any new entries not already in the log.
            for entry in rpc.entries:
                # Add the entry/term to the log
                # (each entry is unpacked positionally; entry[0] is the
                # version and entry[1] the term, as used just below)
                self.log.append(*entry)
                self.sim.logger.debug("appending {} to {} on {}".format(entry[0], entry[1], self))

                # Update the versions to compute visibilities
                entry[0].update(self)

            # Log the last write from the append entries.
            self.sim.logger.debug(
                "{} writes {} at idx {} (term {}, commit {})".format(
                    self, self.log.lastVersion, self.log.lastApplied, self.log.lastTerm, self.log.commitIndex
                )
            )

        # If leaderCommit > commitIndex, update commit Index
        # (never past the last entry this replica actually holds)
        if rpc.leaderCommit > self.log.commitIndex:
            self.log.commitIndex = min(rpc.leaderCommit, self.log.lastApplied)

        # Return success response.
        return self.send(msg.source, AEResponse(self.currentTerm, True, self.log.lastApplied, self.log.lastCommit))

    def on_ae_response_rpc(self, msg):
        """
        Handles acknowledgment of append entries message.

        As leader: a successful response records the follower's progress in
        nextIndex / matchIndex; a failed one backs nextIndex off by one
        (never below 1) and retries AppendEntries for that follower. The
        leader then looks for the highest index above commitIndex that a
        passing Election over matchIndex has reached and whose entry was
        written in the current term, marks the newly committed versions and
        advances commitIndex.

        As candidate: steps down to follower when the response's term is at
        least the current term. As follower: the response is ignored. Any
        other state raises a RaftRPCException.
        """
        rpc = msg.value

        if self.state == State.LEADER:

            if rpc.success:
                self.nextIndex[msg.source] = rpc.lastLogIndex + 1
                self.matchIndex[msg.source] = rpc.lastLogIndex

            else:
                # Decrement next index and retry append entries
                # Ensure to floor the nextIndex to 1 (the start of the log).
                nidx = self.nextIndex[msg.source] - 1
                self.nextIndex[msg.source] = max(nidx, 1)
                self.send_append_entries(msg.source)

            # Decide if we can commit the entry; try the highest index first.
            for n in xrange(self.log.lastApplied, self.log.commitIndex, -1):
                commit = Election(self.matchIndex.keys())
                for k, v in self.matchIndex.iteritems():
                    commit.vote(k, v >= n)

                if commit.has_passed() and self.log[n][1] == self.currentTerm:
                    # The setting does not change between entries: read once.
                    forte = bool(settings.simulation.forte_on_commit)

                    # Commit every entry after the previous commit index up
                    # to n. The entry at commitIndex itself was committed
                    # earlier, so the range starts one past it.
                    for idx in xrange(self.log.commitIndex + 1, n + 1):
                        if self.log[idx][0] is None:
                            continue
                        self.log[idx][0].update(self, commit=True, forte=forte)

                    # Set the commit index and break
                    self.log.commitIndex = n
                    break

        elif self.state == State.CANDIDATE:

            # Decide whether or not to step down.
            if rpc.term >= self.currentTerm:
                ## Become a follower
                self.state = State.FOLLOWER

                ## Log the failed election
                self.sim.logger.info("{} has stepped down as candidate".format(self))

                return

        elif self.state == State.FOLLOWER:
            # Ignore AE messages if we are the follower.
            return

        else:
            raise RaftRPCException("Append entries response in unknown state: '{}'".format(self.state))

    def on_remote_write_rpc(self, message):
        """
        Handles a RemoteWrite: performs the forwarded write on this replica
        and reports the outcome back to the sender.

        The access being written is read from the message's `version`
        field. The WriteResponse reports success unless the access ended up
        dropped by the write.
        """
        # Replay the forwarded access through the normal write path.
        forwarded = message.value.version
        self.write(forwarded)

        # A dropped access means the write did not go through.
        succeeded = not forwarded.is_dropped()

        # Tell the originating replica how it went.
        reply = WriteResponse(self.currentTerm, succeeded, forwarded)
        self.send(message.source, reply)

    def on_write_response_rpc(self, message):
        """
        Handles the reply to a remote write: the access carried by the
        response is completed when the write is reported as successful, and
        left untouched otherwise.
        """
        response = message.value

        if not response.success:
            return

        response.access.complete()
Example #5
0
class FloatedRaftReplica(RaftReplica):
    """
    A Raft replica that runs consensus only with the nodes in its own
    location and, while it is the leader, periodically gossips newly
    committed versions to one node in every other location.
    """

    def __init__(self, simulation, **kwargs):
        ## Initialize the replica
        super(FloatedRaftReplica, self).__init__(simulation, **kwargs)

        # Anti entropy settings
        self.ae_delay = kwargs.get('anti_entropy_delay', ANTI_ENTROPY_DELAY)
        self.ae_timer = None
        self.ae_cache = []

    @memoized
    def locations(self):
        """
        Returns the set of locations of all neighbors in this replica's
        consistency group.
        """
        return set(
            [node.location for node in self.neighbors(self.consistency)])

    def quorum(self):
        """
        Yields the neighbors in the same consistency group that share this
        replica's location, followed by the replica itself. These are the
        only nodes Raft consensus is run with.
        """

        # Filter only connections that are in the same consistency group
        for node in self.neighbors(self.consistency):
            if node.location == self.location:
                yield node

        # Don't forget to yield self!
        yield self

    def remotes(self, location=None):
        """
        Yields the neighbors in the same consistency group that are NOT in
        this replica's location; these are the nodes writes are floated to
        using anti-entropy. When `location` is given, only nodes at that
        location are yielded. Note that specifying your current location
        will not return nodes.
        """

        # Filter only connections that are in the same consistency group
        for node in self.neighbors(self.consistency):
            if node.location != self.location:
                if location is not None and node.location != location:
                    continue
                yield node

    def gossip(self):
        """
        Sends the versions collected in the anti-entropy cache to one
        randomly chosen node at every other location, then empties the
        cache and schedules the next round after the anti-entropy delay.
        """

        # Gossip to one node at each location
        for location in self.locations:
            # Don't gossip to nodes in self!
            if location == self.location: continue

            # Select a random target to gossip to. random.choice raises
            # IndexError on an empty sequence, so a location that currently
            # has no remote nodes is skipped instead.
            candidates = list(self.remotes(location))
            if not candidates:
                continue
            target = random.choice(candidates)

            # Log the gossip that's happening
            self.sim.logger.debug("{} gossiping {} entries to {}".format(
                self, len(self.ae_cache), target))

            # Wrap every cached version in a Write access from this replica.
            entries = tuple([
                Write(version.name, self, version) for version in self.ae_cache
            ])

            # Send all the values in the cache.
            self.send(target, Gossip(entries, len(self.ae_cache), -1))

        # Empty the cache on gossip
        self.ae_cache = []

        # Reset the anti-entropy timer
        self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
        self.ae_timer.start()

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Does the same stuff as super, but also - if leader; starts the anti
        entropy interval to do gossiping. Followers and candidates cancel
        any running anti-entropy timer.
        """
        super(FloatedRaftReplica, self).on_state_change()

        if self.state in (State.FOLLOWER, State.CANDIDATE):
            # ae_timer is assigned only after super().__init__ returns in
            # __init__, hence the hasattr guard for earlier state changes.
            if hasattr(self, 'ae_timer') and self.ae_timer is not None:
                # Cancel the anti-entropy timer.
                self.ae_timer.stop()
                self.ae_timer = None
        elif self.state == State.LEADER:
            self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
            self.ae_timer.start()
        elif self.state == State.READY:
            # This happens on the call to super, just ignore for now.
            pass
        else:
            raise SimulationException(
                "Unknown Floating Raft State: {!r} set on {}".format(
                    self.state, self))

    def on_gossip_rpc(self, message):
        """
        Handles the receipt of a gossip from another node. Expects multiple
        accesses (Write events) as entries. Every version is flagged as
        gossiped and written to this replica; an empty successful
        GossipResponse is then sent back.
        """
        entries = message.value.entries

        # Go through the entries from the RPC and write to local cluster.
        for access in entries:
            # The flag keeps on_ae_response_rpc from caching it for gossip.
            access.version.gossiped = True
            self.write(access)

        # Should we return with what's in our cache?
        # Respond to the sender
        self.send(message.source, GossipResponse([], 0, True, -1))

    def on_response_rpc(self, message):
        """
        Just receives the acknowledgment of the response.
        """
        pass

    def on_ae_response_rpc(self, msg):
        """
        Handles the response to an AppendEntries message and additionally
        caches newly committed versions so they can be gossiped later.

        As leader: a successful response records the follower's progress;
        a failed one backs nextIndex off by one (never below 1) and retries.
        Newly committed versions that are not flagged as gossiped are added
        to the anti-entropy cache. As candidate: steps down when the
        response's term is at least the current term. As follower: ignored.
        Any other state raises a RaftRPCException.
        """
        rpc = msg.value

        if self.state == State.LEADER:

            if rpc.success:
                self.nextIndex[msg.source] = rpc.lastLogIndex + 1
                self.matchIndex[msg.source] = rpc.lastLogIndex

            else:
                # Decrement next index and retry append entries.
                # Floor the nextIndex to 1 (the start of the log); otherwise
                # repeated failures push it to 0 and send_append_entries
                # would compute a prevLogIndex of -1.
                nidx = self.nextIndex[msg.source] - 1
                self.nextIndex[msg.source] = max(nidx, 1)
                self.send_append_entries(msg.source)

            # Decide if we can commit the entry
            for n in xrange(self.log.lastApplied, self.log.commitIndex, -1):
                commit = Election(self.matchIndex.keys())
                for k, v in self.matchIndex.iteritems():
                    commit.vote(k, v >= n)

                if commit.has_passed() and self.log[n][1] == self.currentTerm:
                    # Commit every entry after the previous commit index up
                    # to n. The entry at commitIndex itself was committed
                    # earlier; starting one past it keeps that version from
                    # being cached (and gossiped) a second time.
                    for idx in xrange(self.log.commitIndex + 1, n + 1):
                        version = self.log[idx][0]
                        if version is None: continue

                        # Cache the version to anti-entropy, unless it
                        # reached us through gossip in the first place.
                        if not getattr(version, 'gossiped', False):
                            self.ae_cache.append(version)

                        version.update(self, commit=True)

                    # Set the commit index and break
                    self.log.commitIndex = n
                    break

        elif self.state == State.CANDIDATE:

            # Decide whether or not to step down.
            if rpc.term >= self.currentTerm:
                ## Become a follower
                self.state = State.FOLLOWER

                ## Log the failed election
                self.sim.logger.info(
                    "{} has stepped down as candidate".format(self))

                return

        elif self.state == State.FOLLOWER:
            # Ignore AE messages if we are the follower.
            return

        else:
            raise RaftRPCException(
                "Append entries response in unknown state: '{}'".format(
                    self.state))
Example #6
0
class RaftReplica(ConsensusReplica):

    def __init__(self, simulation, **kwargs):
        """
        Sets up a Raft replica for the given simulation. Recognized keyword
        arguments (each with a module-level default): read_policy,
        aggregate_writes, election_timeout and heartbeat_interval. All
        keyword arguments are also passed on to the parent initializer.
        """
        ## Initialize the replica
        super(RaftReplica, self).__init__(simulation, **kwargs)

        ## Raft state: every replica starts as a follower in term 0 with
        ## no vote cast, an empty log and an empty write cache.
        self.state = State.FOLLOWER
        self.currentTerm = 0
        self.votedFor = None
        self.log = MultiObjectWriteLog()
        self.cache = {}

        ## Policies
        policy_name = kwargs.get('read_policy', READ_POLICY)
        self.read_policy = ReadPolicy.get(policy_name)
        self.aggregate_writes = kwargs.get('aggregate_writes', AGGREGATE_WRITES)

        ## Timers: the election timeout and the leader's heartbeat.
        election_delay = kwargs.get('election_timeout', ELECTION_TIMEOUT)
        heartbeat_delay = kwargs.get('heartbeat_interval', HEARTBEAT_INTERVAL)

        self.timeout = ElectionTimer.fromReplica(self, election_delay)
        self.heartbeat = Timer(self.env, heartbeat_delay, self.on_heartbeat_timeout)

        ## Leader state (populated only while this replica is the leader)
        self.nextIndex = None
        self.matchIndex = None

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def recv(self, event):
        """
        Performs the message-wide term check before the message is handed
        to its RPC specific handler: when the incoming RPC carries a term
        greater than ours, this replica becomes a follower and adopts that
        term. The event is then passed to the parent's recv, whose result
        is returned.
        """
        rpc = event.value.value

        # A newer term anywhere in the cluster demotes us to follower.
        if rpc.term > self.currentTerm:
            self.state = State.FOLLOWER
            self.currentTerm = rpc.term

        # Record the received message and dispatch to event handler
        return super(RaftReplica, self).recv(event)

    def read(self, name, **kwargs):
        """
        Performs a local read of `name`. Which version is returned depends
        on the replica's read policy (see read_via_policy): formerly this
        was always the latest committed version, which was one of the key
        differences from eventual consistency.

        Returns the completed access, or the result of dropping the access
        as empty when no version is available for the name.
        """
        # Create the read event using super.
        access = super(RaftReplica, self).read(name, **kwargs)

        # Record the number of attempts for the access
        if access.is_local_to(self):
            access.attempts += 1

        found = self.read_via_policy(access.name)

        if found is not None:
            # The read is served locally, so it completes right away.
            access.update(found, completed=True)

            # Log the access from this particular replica.
            access.log(self)
            return access

        # Nothing to read: drop the access as empty.
        return access.drop(empty=True)

    def write(self, name, **kwargs):
        """
        Writes can start on any replica, including followers. The access is
        created by super, which lets us tell local writes from remote ones.

        Local write:
        - a new version is created from the latest one visible under the
          read policy and attached to the access.

        Remote write:
        - the access must already carry a version; an AccessError is raised
          if it does not.

        Then, in both cases:
        - not the leader: the version is stored in the cache (so followers
          can read their own writes) and the access is forwarded to the
          leader as a remote write; the result of that call is returned.
        - leader: the access is appended to the log. Local writes are
          completed immediately, remote ones are left for the origin to
          complete. The version is updated to track visibility latency
          and, unless writes are being aggregated, AppendEntries is sent
          right away and the heartbeat is stopped.
        """
        access = super(RaftReplica, self).write(name, **kwargs)
        is_local = access.is_local_to(self)

        if is_local:
            # Record the number of attempts for the access
            access.attempts += 1

            # New version on top of the latest one read by policy.
            version = self.write_via_policy(access.name)
            access.update(version)

            # Log the access from this particular replica.
            access.log(self)

        else:
            # Log the access from this particular replica.
            access.log(self)

            # A forwarded write has to bring its version along.
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".format(self)
                )

            version = access.version

        if self.state != State.LEADER:
            if not is_local:
                # Remote write occurred from client to a follower
                self.sim.logger.info(
                    "remote write on follower node: {}".format(self)
                )

            # Store the version in the cache and send remote write.
            self.cache[access.name] = version
            return self.send_remote_write(access)

        # Leader: append; only local writes are completed here.
        self.append_via_policy(access, complete=bool(is_local))

        # At this point we've dealt with local vs. remote, we should be the leader
        assert self.state == State.LEADER

        # Update the version to track visibility latency
        forte = bool(settings.simulation.forte_on_append)
        version.update(self, forte=forte)

        # Now do AppendEntries
        # Also interrupt the heartbeat since we just sent AppendEntries
        if not self.aggregate_writes:
            self.send_append_entries()
            self.heartbeat.stop()

        return access

    def run(self):
        """
        Generator that drives the replica: on every pass it starts, and
        yields, the timer that matches the current state -- the election
        timeout for followers and candidates, the heartbeat for the leader.

        Raises a SimulationException if the replica is in any other state.
        """
        waiting_states = {State.FOLLOWER, State.CANDIDATE}

        while True:
            current = self.state

            if current in waiting_states:
                # Not the leader: wait on the election timeout.
                yield self.timeout.start()
                continue

            if current == State.LEADER:
                # Leader: wait on the heartbeat interval.
                yield self.heartbeat.start()
                continue

            raise SimulationException(
                "Unknown Raft State: {!r} on {}".format(self.state, self)
            )

    ######################################################################
    ## Helper Methods
    ######################################################################

    def send_append_entries(self, target=None):
        """
        Sends an AppendEntries message to every node tracked in nextIndex,
        or only to `target` when one is supplied. Does nothing unless this
        replica is currently the leader.

        The message carries every log entry from the node's next index
        onward, or an empty list (a plain heartbeat) when the log holds
        nothing at or past that index.

        Note: a target that is not a key of nextIndex is silently skipped.
        """
        # Only the leader replicates its log.
        if self.state != State.LEADER:
            return

        for follower, next_idx in self.nextIndex.iteritems():
            # When a target is given, skip every other node.
            if target is not None and follower != target:
                continue

            # Entries the follower is missing; empty means heartbeat only.
            if self.log.lastApplied >= next_idx:
                pending = self.log[next_idx:]
            else:
                pending = []

            # The entry immediately preceding the ones being sent.
            prev_idx = next_idx - 1
            prev_term = self.log[prev_idx].term

            message = AppendEntries(
                self.currentTerm, self.id, prev_idx,
                prev_term, pending, self.log.commitIndex
            )
            self.send(follower, message)

    def send_remote_write(self, access):
        """
        Forwards a write access to the current leader as a RemoteWrite.

        Returns the access after sending it. If no leader can be found the
        drop is logged and the result of access.drop() is returned instead.
        """
        leader = self.get_leader_node()

        if leader:
            # Hand the write to the leader, tagged with our current term.
            request = RemoteWrite(self.currentTerm, access)
            self.send(leader, request)
            return access

        # No leader is known, so the write cannot go anywhere.
        self.sim.logger.info(
            "no leader: dropped write at {}".format(self)
        )
        return access.drop()

    def get_leader_node(self):
        """
        Scans the quorum for a node whose state is LEADER.

        Returns that node, or None when the quorum has no leader. Raises a
        SimulationException when more than one node claims leadership.
        """
        leaders = []
        for node in self.quorum():
            if node.state == State.LEADER:
                leaders.append(node)

        count = len(leaders)
        if count < 1:
            return None
        if count > 1:
            raise SimulationException("MutipleLeaders?!")
        return leaders[0]

    def read_via_policy(self, name):
        """
        Looks up the version of `name` that the replica's read policy says
        should be visible:

            - COMMIT: the latest committed version in the log; the cache is
              never consulted.
            - LATEST: the latest version in the log (committed or not),
              unless the cache holds a greater version for the name.

        NOTE: under LATEST the cache is only consulted when the log already
        has a version for the name; otherwise None is returned even if a
        cached version exists.

        Raises a SimulationException for any other read policy.
        """
        policy = self.read_policy

        # Read committed: straight from the log.
        if policy == ReadPolicy.COMMIT:
            return self.log.get_latest_commit(name)

        # Read latest: newest log version, possibly superseded by the cache.
        if policy == ReadPolicy.LATEST:
            newest = self.log.get_latest_version(name)

            if name in self.cache and newest is not None:
                cached = self.cache[name]
                if cached > newest:
                    return cached

            return newest

        # Neither known policy matched.
        raise SimulationException("Unknown read policy!")

    def write_via_policy(self, name):
        """
        Produces the version for a new write to ``name``.

        The base version is fetched through ``read_via_policy``, so it comes
        from the log or the cache according to the replica's read policy
        (READ COMMITTED / READ LATEST). If a base version exists the result
        of its ``nextv(self)`` is returned; otherwise the name has not been
        written yet and ``namespace(name)(self)`` is returned instead.
        """
        base = self.read_via_policy(name)

        if base is not None:
            # Increment from the version selected by the read policy.
            return base.nextv(self)

        # A "new" write: nothing to increment from.
        return namespace(name)(self)

    def append_via_policy(self, access, complete=False):
        """
        Gatekeeper for appending a client access to the leader's log.

        This is where admission policies such as "don't admit forks" can be
        implemented; an implementation must drop the access if it does not
        meet the policy. The default policy appends every access. Subclasses
        (as in Federated) can modify this.

        If ``complete`` is truthy the access is completed after the append.
        Returns True to indicate that the access was appended.

        NOTE: leader-only. Followers get entries through AppendEntries, so a
        RaftRPCException is raised when the replica is not the leader.
        """
        if self.state != State.LEADER:
            reason = "Append via policies called on a follower replica!"
            raise RaftRPCException(reason)

        # Default policy: admit everything, stamped with the current term.
        term = self.currentTerm
        self.log.append(access.version, term)

        # Only complete the access when the caller asked for it.
        if complete:
            access.complete()

        return True

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Resets the state-dependent bookkeeping whenever the replica's state
        changes, i.e. the properties that define how the node interacts with
        RPC messages and client reads/writes:

            - FOLLOWER / CANDIDATE: clear the vote and the leader-only
              nextIndex/matchIndex tables.
            - LEADER: rebuild nextIndex (one past this replica's
              lastApplied) and matchIndex (zero) for every other member of
              the quorum.
            - READY: nothing to do; this happens on the call to super.

        Raises a SimulationException for any other state.
        """
        if self.state in (State.FOLLOWER, State.CANDIDATE):
            # Candidates are reset exactly like followers. A separate
            # CANDIDATE branch after this test could never be reached, so
            # there is none.
            self.votedFor = None
            self.nextIndex = None
            self.matchIndex = None
        elif self.state == State.LEADER:
            # Both tables are keyed by every quorum member except ourselves.
            peers = [node for node in self.quorum() if node != self]
            first_index = self.log.lastApplied + 1
            self.nextIndex = {node: first_index for node in peers}
            self.matchIndex = {node: 0 for node in peers}
        elif self.state == State.READY:
            # This happens on the call to super, just ignore for now.
            pass
        else:
            raise SimulationException(
                "Unknown Raft State: {!r} set on {}".format(self.state, self)
            )

    def on_heartbeat_timeout(self):
        """
        Heartbeat timer callback for the AppendEntries RPC. A leader calls
        ``send_append_entries`` (heartbeat or aggregated writes); a replica
        in any other state does nothing.
        """
        if self.state == State.LEADER:
            self.send_append_entries()

    def on_election_timeout(self):
        """
        Election timer callback: the replica becomes a candidate for the next
        term, votes for itself and asks every other member of the quorum for
        its vote.
        """
        # The state is switched first, before any vote bookkeeping is set.
        self.state = State.CANDIDATE

        # Open the new term with a fresh election that already has our vote.
        self.currentTerm += 1
        voter_ids = [node.id for node in self.quorum()]
        self.votes = Election(voter_ids)
        self.votes.vote(self.id)
        self.votedFor = self.id

        # The same request object goes to everyone in the quorum but us.
        request = RequestVote(
            self.currentTerm, self.id, self.log.lastApplied, self.log.lastTerm
        )
        for peer in self.quorum():
            if not peer == self:
                self.send(peer, request)

        # Record the newly formed candidacy.
        self.sim.logger.info("{} is now a leader candidate".format(self))

    def on_request_vote_rpc(self, msg):
        """
        Handles a RequestVote RPC by replying to the sender with a
        VoteResponse carrying our current term and the decision.

        The vote is granted only when all of the following hold, checked in
        this order:

            - the request's term is not behind our current term
            - we have not voted yet, or already voted for this candidate
            - ``self.log.as_up_to_date`` accepts the candidate's last log
              term and index

        Granting the vote also stops the election timeout and records the
        candidate in ``votedFor``. The result of ``send`` is returned.
        """
        request = msg.value

        term_ok = request.term >= self.currentTerm
        free_to_vote = term_ok and (
            self.votedFor is None or self.votedFor == request.candidateId
        )
        granted = free_to_vote and self.log.as_up_to_date(
            request.lastLogTerm, request.lastLogIndex
        )

        if not granted:
            rejection = VoteResponse(self.currentTerm, False)
            return self.send(msg.source, rejection)

        self.sim.logger.info(
            "{} voting for {}".format(self, request.candidateId)
        )

        self.timeout.stop()
        self.votedFor = request.candidateId
        approval = VoteResponse(self.currentTerm, True)
        return self.send(msg.source, approval)

    def on_vote_response_rpc(self, msg):
        """
        Handles a VoteResponse RPC, i.e. the reply to a RequestVote.

        A candidate records the vote in its running election and, once the
        election has passed, becomes leader, stops its timeout and sends
        append entries to announce the leadership change. Followers and
        leaders ignore the response. Any other state raises a
        RaftRPCException.
        """
        response = msg.value

        if not self.state == State.CANDIDATE:
            # Vote responses are ignored if we've already been elected (or
            # have gone back to being a follower).
            if self.state in (State.FOLLOWER, State.LEADER):
                return

            raise RaftRPCException(
                "Vote response in unknown state: '{}'".format(self.state)
            )

        # Update the current election with the sender's decision.
        self.votes.vote(msg.source.id, response.voteGranted)
        if not self.votes.has_passed():
            return

        ## Become the leader
        self.state = State.LEADER
        self.timeout.stop()

        ## Send the leadership change append entries
        self.send_append_entries()

        ## Log the new leader
        self.sim.logger.info("{} has become raft leader".format(self))

    def on_append_entries_rpc(self, msg):
        """
        Callback for the AppendEntries RPC call (heartbeat or new entries).

        The message is processed in this order:

            1. The election timeout is stopped.
            2. The message is rejected (AEResponse with success False) when
               its term is behind ours, when our log is shorter than
               prevLogIndex, or when the term stored at prevLogIndex differs
               from prevLogTerm.
            3. If the message carries entries and our log already extends
               past prevLogIndex, it is treated as a possible duplicate and
               acknowledged without appending. Otherwise every entry is
               appended and its version is updated with this replica.
            4. The commit index is advanced towards leaderCommit, capped at
               our lastApplied.

        Every path returns the result of sending an AEResponse (current term,
        success flag, lastApplied, lastCommit) back to the message source.
        """
        rpc = msg.value

        # Stop the election timeout
        # NOTE(review): this happens before the term check below, so a
        # message with a stale term also stops the timeout -- confirm intended.
        self.timeout.stop()

        # Reply false if term < current term
        if rpc.term < self.currentTerm:
            self.sim.logger.info("{} doesn't accept write on term {}".format(self, self.currentTerm))
            return self.send(
                msg.source, AEResponse(self.currentTerm, False, self.log.lastApplied, self.log.lastCommit)
            )

        # Reply false if log doesn't contain an entry at prevLogIndex whose
        # term matches previous log term.
        # Log entries are indexed as (version, term) pairs here, so [1] is the
        # term of the entry.
        if self.log.lastApplied < rpc.prevLogIndex or self.log[rpc.prevLogIndex][1] != rpc.prevLogTerm:
            # The two branches below only pick the log message; both reject.
            if self.log.lastApplied < rpc.prevLogIndex:

                self.sim.logger.info(
                    "{} doesn't accept write on index {} where last applied is {}".format(
                        self, rpc.prevLogIndex, self.log.lastApplied
                    )
                )
            else:
                self.sim.logger.info(
                    "{} doesn't accept write for term mismatch {} vs {}".format(
                        self, rpc.prevLogTerm, self.log[rpc.prevLogIndex][1]
                    )
                )

            return self.send(
                msg.source, AEResponse(self.currentTerm, False, self.log.lastApplied, self.log.lastCommit)
            )

        # At this point AppendEntries RPC is accepted
        if rpc.entries:
            if self.log.lastApplied >= rpc.prevLogIndex:
                # If existing entry conflicts with new one (same index, different terms)
                # Delete the existing entry and all that follow it.
                # NOTE(review): the same term comparison already caused an
                # early return above when it was true, so this truncate does
                # not appear to be reachable -- confirm.
                if self.log[rpc.prevLogIndex][1] != rpc.prevLogTerm:
                    self.log.truncate(rpc.prevLogIndex)

            if self.log.lastApplied > rpc.prevLogIndex:
                # Otherwise this could be a message that is sent again
                # raise RaftRPCException(
                #     "{} is possibly receiving a duplicate append entries!".format(self)
                # )
                # The entries are NOT appended on this path; the message is
                # acknowledged as successful with our current lastApplied.
                self.sim.logger.warn(
                    "{} is possibly receiving a duplicate append entries!".format(self)
                )
                return self.send(msg.source, AEResponse(self.currentTerm, True, self.log.lastApplied, self.log.lastCommit))


            # Append any new entries not already in the log.
            for entry in rpc.entries:
                # Add the entry/term to the log
                self.log.append(*entry)
                self.sim.logger.debug(
                    "appending {} to {} on {}".format(entry[0], entry[1], self)
                )

                # Update the versions to compute visibilities
                entry[0].update(self)

            # Log the last write from the append entries.
            self.sim.logger.debug(
                "{} writes {} at idx {} (term {}, commit {})".format(
                self, self.log.lastVersion, self.log.lastApplied, self.log.lastTerm, self.log.commitIndex
            ))

        # If leaderCommit > commitIndex, update commit Index
        # The new commit index never exceeds what this replica has in its log.
        if rpc.leaderCommit > self.log.commitIndex:
            self.log.commitIndex = min(rpc.leaderCommit, self.log.lastApplied)

        # Return success response.
        return self.send(msg.source, AEResponse(self.currentTerm, True, self.log.lastApplied, self.log.lastCommit))

    def on_ae_response_rpc(self, msg):
        """
        Handles an AEResponse, the acknowledgment of an append entries
        message.

            - LEADER: update nextIndex/matchIndex for the sender on success;
              on failure step the sender's nextIndex back by one (never below
              1) and resend append entries to it. Afterwards try to advance
              the commit index to the highest index that passes an election
              over matchIndex and whose entry belongs to the current term.
            - CANDIDATE: step down to follower if the response's term is not
              behind ours.
            - FOLLOWER: ignore the response.

        Any other state raises a RaftRPCException.
        """
        response = msg.value

        if self.state == State.CANDIDATE:
            # Decide whether or not to step down.
            if response.term >= self.currentTerm:
                ## Become a follower
                self.state = State.FOLLOWER

                ## Log the failed election
                self.sim.logger.info(
                    "{} has stepped down as candidate".format(self)
                )
            return

        if self.state == State.FOLLOWER:
            # Ignore AE messages if we are the follower.
            return

        if not self.state == State.LEADER:
            raise RaftRPCException(
                "Append entries response in unknown state: '{}'".format(self.state)
            )

        # From here on we are the leader.
        follower = msg.source

        if response.success:
            self.nextIndex[follower] = response.lastLogIndex + 1
            self.matchIndex[follower] = response.lastLogIndex
        else:
            # Back off by one entry, floored at 1 (the start of the log),
            # and retry append entries for this follower only.
            self.nextIndex[follower] = max(self.nextIndex[follower] - 1, 1)
            self.send_append_entries(follower)

        # Walk candidate indices from the newest entry down to just above the
        # current commit index; the first one that qualifies is committed.
        for index in xrange(self.log.lastApplied, self.log.commitIndex, -1):
            tally = Election(self.matchIndex.keys())
            for peer, matched in self.matchIndex.iteritems():
                tally.vote(peer, matched >= index)

            if not (tally.has_passed() and self.log[index][1] == self.currentTerm):
                continue

            # Commit all versions from the last commit index up to this one.
            # NOTE(review): the range starts at commitIndex itself, so the
            # entry at the previous commit index is visited again -- confirm
            # that updating it a second time is harmless.
            for idx in xrange(self.log.commitIndex, index + 1):
                version = self.log[idx][0]
                if version is None:
                    continue
                version.update(
                    self,
                    commit=True,
                    forte=bool(settings.simulation.forte_on_commit),
                )

            # Set the commit index and stop looking.
            self.log.commitIndex = index
            break

    def on_remote_write_rpc(self, message):
        """
        Handles a remote write: performs the forwarded access as a local
        write and reports the outcome back to the sender in a WriteResponse
        (current term, success flag, the access).
        """
        # The forwarded access travels in the message value's ``version``
        # attribute.
        forwarded = message.value.version
        self.write(forwarded)

        # A dropped access means the write failed.
        succeeded = not forwarded.is_dropped()

        reply = WriteResponse(self.currentTerm, succeeded, forwarded)
        self.send(message.source, reply)

    def on_write_response_rpc(self, message):
        """
        Handles the response to a remote write: when the response's
        ``success`` flag is truthy, the access carried in the response is
        completed.
        """
        rpc = message.value
        if rpc.success:
            rpc.access.complete()