Example #1
0
    def __init__(self, simulation, **kwargs):
        """
        Builds a Raft replica: the base replica is initialized first, then
        the Raft state, the policies, the timers and the leader-only
        bookkeeping are attached to the instance.

        Recognized keyword arguments (each falls back to the module-level
        default named in parentheses): ``read_policy`` (READ_POLICY),
        ``aggregate_writes`` (AGGREGATE_WRITES), ``election_timeout``
        (ELECTION_TIMEOUT) and ``heartbeat_interval`` (HEARTBEAT_INTERVAL).
        """
        super(RaftReplica, self).__init__(simulation, **kwargs)

        # Shorthand for reading optional settings out of the keyword args.
        option = kwargs.get

        ## Raft state: the replica begins as a follower in term zero
        self.state = State.FOLLOWER
        self.currentTerm = 0
        self.votedFor = None
        self.log = MultiObjectWriteLog()
        self.cache = {}

        ## Policies
        policy_name = option('read_policy', READ_POLICY)
        self.read_policy = ReadPolicy.get(policy_name)
        self.aggregate_writes = option('aggregate_writes', AGGREGATE_WRITES)

        ## Timers for work
        self.timeout = ElectionTimer.fromReplica(
            self, option('election_timeout', ELECTION_TIMEOUT)
        )
        self.heartbeat = Timer(
            self.env,
            option('heartbeat_interval', HEARTBEAT_INTERVAL),
            self.on_heartbeat_timeout
        )

        ## Leader state, left unset here
        self.nextIndex = None
        self.matchIndex = None
Example #2
0
    def gossip(self):
        """
        Anti-entropy round: for every location other than this replica's
        own, one randomly chosen remote node is sent the versions held in
        ``ae_cache``. Afterwards the cache is emptied and a new anti-entropy
        timer (calling back into this method) is created and started.
        """
        for site in self.locations:
            # Only locations other than our own are gossiped to.
            if site != self.location:
                # Pick the random recipient at this location
                candidates = list(self.remotes(site))
                peer = random.choice(candidates)

                # Record the gossip in the simulation log
                self.sim.logger.debug("{} gossiping {} entries to {}".format(
                    self, len(self.ae_cache), peer))

                # Wrap every cached version in a Write originating here
                writes = tuple(
                    Write(item.name, self, item) for item in self.ae_cache
                )

                # Ship the whole cache to the chosen peer
                self.send(peer, Gossip(writes, len(self.ae_cache), -1))

        # The cache has been sent everywhere, start over
        self.ae_cache = []

        # Schedule the next round
        next_round = Timer(self.env, self.ae_delay, self.gossip)
        self.ae_timer = next_round
        next_round.start()
Example #3
0
 def get_anti_entropy_timeout(self):
     """
     Builds the anti-entropy timer from the environment, the fixed
     ``ae_delay`` and the ``gossip`` callback, stores it on
     ``self.timeout`` and starts it.

     Returns whatever the timer's ``start()`` call returns. The delay is
     fixed for now; a randomized delay is a possible future change.
     """
     timer = Timer(self.env, self.ae_delay, self.gossip)
     self.timeout = timer
     return timer.start()
Example #4
0
 def handle_session(self):
     """
     Keeps the session timer in step with accesses: an existing session
     timer is reset, otherwise a new one is created whose callback is
     ``on_session_timeout`` bound to the current simulation time.
     """
     if self.session:
         # A session already exists, push its expiry back.
         self.session = self.session.reset()
         return

     # NOTE(review): the new timer is only constructed here, start() is
     # not called on it in this method -- confirm that is intended.
     expire = partial(self.on_session_timeout, self.env.now)
     self.session = Timer(self.env, self.session_timeout, expire)
Example #5
0
 def run(self):
     """
     Generator that checks in once per heartbeat interval, forever. While
     the replica is in the OWNER state a heartbeat timer is created and
     started each round; in any other state the replica just waits out
     the interval.
     """
     while True:
         if self.state != State.OWNER:
             # Nothing is owned: quiesce for one interval.
             yield self.env.timeout(self.heartbeat_interval)
             continue

         timer = Timer(
             self.env, self.heartbeat_interval, self.on_heartbeat_timeout
         )
         self.heartbeat = timer
         yield timer.start()
Example #6
0
    def gossip(self):
        """
        Sends the contents of the anti-entropy cache to one random node in
        each location other than our own, then clears the cache and starts
        a new anti-entropy timer that will call this method again.
        """
        template = "{} gossiping {} entries to {}"

        for location in self.locations:
            if location == self.location:
                # Never gossip inside our own location.
                continue

            # One random recipient per remote location
            recipients = list(self.remotes(location))
            target = random.choice(recipients)

            self.sim.logger.debug(
                template.format(self, len(self.ae_cache), target)
            )

            # Each cached version travels as a Write created by this node
            entries = []
            for version in self.ae_cache:
                entries.append(Write(version.name, self, version))

            payload = Gossip(tuple(entries), len(self.ae_cache), -1)
            self.send(target, payload)

        # Everything cached has now been sent out
        self.ae_cache = []

        # Start the timer for the next gossip round
        self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
        self.ae_timer.start()
Example #7
0
    def __init__(self, simulation, **kwargs):
        """
        Initializes the base replica and then the Raft-specific state.

        Optional keyword arguments, with their module-level fallbacks:
        ``read_policy`` (READ_POLICY), ``aggregate_writes``
        (AGGREGATE_WRITES), ``election_timeout`` (ELECTION_TIMEOUT) and
        ``heartbeat_interval`` (HEARTBEAT_INTERVAL).
        """
        ## The base replica comes first
        super(RaftReplica, self).__init__(simulation, **kwargs)

        ## Raft settings: start out as a follower with an empty log
        self.state = State.FOLLOWER
        self.currentTerm = 0
        self.votedFor = None
        self.log = MultiObjectWriteLog()
        self.cache = {}

        ## Policies
        requested_policy = kwargs.get("read_policy", READ_POLICY)
        self.read_policy = ReadPolicy.get(requested_policy)
        self.aggregate_writes = kwargs.get("aggregate_writes", AGGREGATE_WRITES)

        ## Timer settings
        election_timeout = kwargs.get("election_timeout", ELECTION_TIMEOUT)
        heartbeat_interval = kwargs.get("heartbeat_interval", HEARTBEAT_INTERVAL)

        ## Timers for work
        election_timer = ElectionTimer.fromReplica(self, election_timeout)
        self.timeout = election_timer

        heartbeat_timer = Timer(
            self.env, heartbeat_interval, self.on_heartbeat_timeout
        )
        self.heartbeat = heartbeat_timer

        ## Leader state is not set at construction time
        self.nextIndex = self.matchIndex = None
Example #8
0
 def get_anti_entropy_timeout(self):
     """
     Creates the anti-entropy timer (environment, ``ae_delay``, ``gossip``
     callback), keeps it on ``self.timeout`` and returns the result of
     starting it. The delay is currently fixed rather than random.
     """
     delay, callback = self.ae_delay, self.gossip
     self.timeout = Timer(self.env, delay, callback)

     started = self.timeout.start()
     return started
Example #9
0
 def handle_session(self):
     """
     Called on every access. With no session timer present a new one is
     built (callback: ``on_session_timeout`` bound to the current time);
     otherwise the existing timer is reset and the result of the reset is
     kept as the session.
     """
     current = self.session

     if not current:
         # NOTE(review): start() is not called on the fresh timer in this
         # method -- confirm it is started elsewhere or on the next reset.
         callback = partial(self.on_session_timeout, self.env.now)
         self.session = Timer(self.env, self.session_timeout, callback)
     else:
         self.session = current.reset()
Example #10
0
    def on_state_change(self):
        """
        Runs the parent handler, then manages the anti-entropy timer for
        the new state: followers and candidates cancel it, a leader creates
        and starts one, READY is ignored.

        Raises SimulationException for any other state.
        """
        super(FloatedRaftReplica, self).on_state_change()

        state = self.state

        if state in (State.FOLLOWER, State.CANDIDATE):
            # The attribute may not exist yet, hence the default.
            timer = getattr(self, 'ae_timer', None)
            if timer is not None:
                timer.stop()
                self.ae_timer = None
            return

        if state == State.LEADER:
            # Leaders gossip: start the anti-entropy interval.
            self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
            self.ae_timer.start()
            return

        if state == State.READY:
            # Set during the call to super; nothing to do for now.
            return

        raise SimulationException(
            "Unknown Floating Raft State: {!r} set on {}".format(
                self.state, self))
Example #11
0
 def run(self):
     """
     Endless generator yielding one event per heartbeat interval: the
     started heartbeat timer when this replica is in the OWNER state, or a
     plain environment timeout of the same length otherwise.
     """
     while True:
         if self.state == State.OWNER:
             # Owners get a fresh heartbeat timer every round.
             self.heartbeat = Timer(
                 self.env, self.heartbeat_interval, self.on_heartbeat_timeout
             )
             pause = self.heartbeat.start()
         else:
             # Non-owners simply wait out the interval.
             pause = self.env.timeout(self.heartbeat_interval)

         yield pause
Example #12
0
    def on_state_change(self):
        """
        Extends the parent state-change handler with anti-entropy timer
        management: the timer is cancelled when the replica is a follower
        or candidate and (re)created and started when it is the leader.
        The READY state needs no extra work.

        Raises SimulationException when the state is none of the above.
        """
        super(FloatedRaftReplica, self).on_state_change()

        stepping_down = self.state in (State.FOLLOWER, State.CANDIDATE)

        if stepping_down:
            # ae_timer may be missing entirely, so check before touching it.
            timer_exists = hasattr(self, 'ae_timer') and self.ae_timer is not None
            if timer_exists:
                self.ae_timer.stop()
                self.ae_timer = None

        elif self.state == State.LEADER:
            gossip_timer = Timer(self.env, self.ae_delay, self.gossip)
            self.ae_timer = gossip_timer
            gossip_timer.start()

        elif self.state == State.READY:
            # Occurs during the call to super; deliberately a no-op.
            pass

        else:
            message = "Unknown Floating Raft State: {!r} set on {}".format(
                self.state, self
            )
            raise SimulationException(message)
Example #13
0
class FloatedRaftReplica(RaftReplica):
    """
    A Raft replica whose quorum is restricted to the nodes at its own
    location (see ``quorum``). While leader it runs an anti-entropy timer
    that gossips newly committed versions to one node in every other
    location (see ``gossip``).
    """

    def __init__(self, simulation, **kwargs):
        """
        Initializes the Raft replica and then the anti-entropy settings.

        The ``anti_entropy_delay`` keyword argument is the delay handed to
        the anti-entropy timer; it defaults to ANTI_ENTROPY_DELAY.
        """
        ## Initialize the replica
        super(FloatedRaftReplica, self).__init__(simulation, **kwargs)

        # Anti entropy settings
        self.ae_delay = kwargs.get('anti_entropy_delay', ANTI_ENTROPY_DELAY)
        self.ae_timer = None   # created when this replica becomes leader
        self.ae_cache = []     # versions waiting to be gossiped

    @memoized
    def locations(self):
        """
        Returns the set of locations of all neighbors in this replica's
        consistency group.
        """
        return set([
            node.location for node in self.neighbors(self.consistency)
        ])

    def quorum(self):
        """
        Yields the neighbors in the same consistency group that share this
        replica's location, followed by the replica itself.
        """

        # Filter only connections that are in the same consistency group
        for node in self.neighbors(self.consistency):
            if node.location == self.location:
                yield node

        # Don't forget to yield self!
        yield self

    def remotes(self, location=None):
        """
        Yields the neighbors in the same consistency group that are NOT at
        this replica's location. When ``location`` is given, only remotes at
        that location are yielded; passing the replica's own location
        therefore yields nothing.
        """

        # Filter only connections that are in the same consistency group
        for node in self.neighbors(self.consistency):
            if node.location != self.location:
                if location is not None and node.location != location:
                    continue
                yield node

    def gossip(self):
        """
        Sends the anti-entropy cache to one randomly selected node in each
        location other than our own, then empties the cache and starts a
        new anti-entropy timer that calls this method again.
        """

        # Gossip to one node at each location
        for location in self.locations:
            # Don't gossip to nodes in self!
            if location == self.location:
                continue

            # Select a random target to gossip to
            target = random.choice(list(self.remotes(location)))

            # Log the gossip that's happening
            self.sim.logger.debug(
                "{} gossiping {} entries to {}".format(
                    self, len(self.ae_cache), target
                )
            )

            # A fresh Write is built per target for every cached version.
            entries = tuple([
                Write(version.name, self, version)
                for version in self.ae_cache
            ])

            # Send all the values in the cache.
            self.send(target, Gossip(entries, len(self.ae_cache), -1))

        # Empty the cache on gossip
        self.ae_cache = []

        # Reset the anti-entropy timer
        self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
        self.ae_timer.start()

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Does the same stuff as super, but also manages the anti-entropy
        timer: followers and candidates cancel it, the leader starts it.

        Raises SimulationException for a state other than FOLLOWER,
        CANDIDATE, LEADER or READY.
        """
        super(FloatedRaftReplica, self).on_state_change()

        if self.state in (State.FOLLOWER, State.CANDIDATE):
            # ae_timer may not exist yet, so look it up defensively.
            timer = getattr(self, 'ae_timer', None)
            if timer is not None:
                # Cancel the anti-entropy timer.
                timer.stop()
                self.ae_timer = None
        elif self.state == State.LEADER:
            self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
            self.ae_timer.start()
        elif self.state == State.READY:
            # This happens on the call to super, just ignore for now.
            pass
        else:
            raise SimulationException(
                "Unknown Floating Raft State: {!r} set on {}".format(self.state, self)
            )

    def on_gossip_rpc(self, message):
        """
        Handles the receipt of a gossip from another node. Every entry (a
        Write access) has its version flagged as gossiped and is written
        locally. The sender is then answered with an empty, successful
        GossipResponse; no conflict checking happens here.
        """
        entries = message.value.entries

        # Go through the entries from the RPC and write to local cluster.
        for access in entries:
            # Flag the version so it is not cached for gossip again on commit.
            access.version.gossiped = True
            self.write(access)

        # Should we return with what's in our cache?
        # Respond to the sender
        self.send(message.source, GossipResponse([], 0, True, -1))

    def on_response_rpc(self, message):
        """
        Just receives the acknowledgment of the response.
        """
        pass

    def on_ae_response_rpc(self, msg):
        """
        Handles an append entries response. As leader the follower indices
        are updated and newly committable entries are committed; every
        committed version not flagged as gossiped is also added to the
        anti-entropy cache. As candidate the replica steps down when the
        response term is at least its own. Followers ignore the message.

        Raises RaftRPCException when the replica is in any other state.
        """
        rpc = msg.value

        if self.state == State.LEADER:

            if rpc.success:
                self.nextIndex[msg.source] = rpc.lastLogIndex + 1
                self.matchIndex[msg.source] = rpc.lastLogIndex

            else:
                # Decrement next index and retry append entries
                self.nextIndex[msg.source] -= 1
                self.send_append_entries(msg.source)

            # Decide if we can commit the entry, highest index first.
            # range/items/list() behave the same on Python 2 and 3.
            for n in range(self.log.lastApplied, self.log.commitIndex, -1):
                commit = Election(list(self.matchIndex))
                for node, matched in self.matchIndex.items():
                    commit.vote(node, matched >= n)

                if commit.has_passed() and self.log[n][1] == self.currentTerm:
                    # Commit all versions from the last log entry to now.
                    for idx in range(self.log.commitIndex, n + 1):
                        version = self.log[idx][0]
                        if version is None:
                            continue

                        # Cache the version to anti-entropy!
                        if not getattr(version, 'gossiped', False):
                            self.ae_cache.append(version)

                        version.update(self, commit=True)

                    # Set the commit index and break
                    self.log.commitIndex = n
                    break

        elif self.state == State.CANDIDATE:

            # Decide whether or not to step down.
            if rpc.term >= self.currentTerm:
                ## Become a follower
                self.state = State.FOLLOWER

                ## Log the failed election
                self.sim.logger.info(
                    "{} has stepped down as candidate".format(self)
                )

                return

        elif self.state == State.FOLLOWER:
            # Ignore AE messages if we are the follower.
            return

        else:
            raise RaftRPCException(
                "Append entries response in unknown state: '{}'".format(self.state)
            )
Example #14
0
    def read(self, name, **kwargs):
        """
        Performs a read at this tag replica and returns the access.

        The read is converted into an access by the parent class, then:

            - if this replica owns the tag, the read completes with the
              latest version in the log (or is dropped as empty when the
              log has no version yet)
            - if another replica owns the tag, the read is dropped
            - if nobody owns the tag, a retry is scheduled one heartbeat
              interval later and, on the first attempt while not already
              tagging, the tag is requested

        TODO: Remote vs Local Reads
        """
        access = super(TagReplica, self).read(name, **kwargs)

        # Only locally originated accesses count their attempts
        if access.is_local_to(self):
            access.attempts += 1

        # Any access extends the session
        self.handle_session()

        # Case 1: the tag is ours
        if self.owns(access.name):
            # TODO: Change to last commit!
            version = self.log[access.name].lastVersion

            # Nothing has been written yet, so there is nothing to read
            if version is None:
                return access.drop(empty=True)

            access.update(version, completed=True)
            access.log(self)
            return access

        # Case 2: the tag belongs to someone else, drop the read
        if self.find_owner(access.name) is not None:
            self.sim.logger.info("ownership conflict: dropped {} at {}".format(
                access, self))
            return access.drop()

        # Case 3: the tag is unowned, so go and acquire it
        access.log(self)

        # Acquisition takes time: retry the read after a heartbeat interval
        Timer(
            self.env, self.heartbeat_interval, lambda: self.read(access)
        ).start()

        first_attempt = access.attempts <= 1
        if first_attempt and self.state != State.TAGGING:
            self.acquire(access.name)

        return access
Example #15
0
class TagReplica(ConsensusReplica):
    def __init__(self, simulation, **kwargs):
        """
        Sets up the tag replica. All tag-specific attributes are assigned
        before the parent initializer runs, and the state is set to READY
        afterwards.

        Optional keyword arguments: ``session_timeout`` (default
        SESSION_TIMEOUT) and ``heartbeat_interval`` (default
        HEARTBEAT_INTERVAL).
        """
        # NOTE(review): attributes are set ahead of the super call,
        # presumably because the parent initializer relies on them -- confirm.

        ## Timer settings and the (not yet created) timers
        session_timeout = kwargs.get('session_timeout', SESSION_TIMEOUT)
        heartbeat_interval = kwargs.get('heartbeat_interval', HEARTBEAT_INTERVAL)

        self.session_timeout = session_timeout
        self.heartbeat_interval = heartbeat_interval
        self.session = self.heartbeat = None

        ## Tag specific settings: one write log and one tag set per key
        self.epoch = 0
        self.log = defaultdict(WriteLog)
        self.view = defaultdict(set)

        ## Owner state
        self.nextIndex = self.matchIndex = None

        ## Parent initialization, after which the replica is ready
        super(TagReplica, self).__init__(simulation, **kwargs)
        self.state = State.READY

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def read(self, name, **kwargs):
        """
        Reads ``name`` at this replica and returns the resulting access.

        The parent class turns the read into an access. What happens next
        depends on who owns the tag in the current view:

            - this replica: complete the read with the log's last version,
              or drop it as empty if there is none
            - another replica: log the ownership conflict and drop the read
            - nobody: log the access, schedule a retry after one heartbeat
              interval and (first attempt only, and not while tagging)
              request the tag

        Current implementation: no remote access.

        TODO: Remote vs Local Reads
        """
        # The parent builds the read access.
        access = super(TagReplica, self).read(name, **kwargs)

        # Attempts are only tracked for local accesses.
        local = access.is_local_to(self)
        if local:
            access.attempts += 1

        # Keep the session alive.
        self.handle_session()

        if not self.owns(access.name):
            holder = self.find_owner(access.name)

            if holder is None:
                # No owner anywhere: log, retry later and try to acquire.
                access.log(self)

                def retry_read():
                    return self.read(access)

                Timer(self.env, self.heartbeat_interval, retry_read).start()

                if access.attempts <= 1 and self.state != State.TAGGING:
                    self.acquire(access.name)

                return access

            # Someone else holds the tag, for now the read is dropped.
            self.sim.logger.info("ownership conflict: dropped {} at {}".format(
                access, self))
            return access.drop()

        # We own the tag.
        # TODO: Change to last commit!
        latest = self.log[access.name].lastVersion
        if latest is None:
            # Nothing written yet means nothing to read.
            return access.drop(empty=True)

        access.update(latest, completed=True)
        access.log(self)
        return access

    def write(self, name, **kwargs):
        """
        Performs a write at this replica and returns the access.

        ``name`` is handed to the parent class, which returns the access;
        retries and gossip pass an existing access in place of a name, so
        everything below keys off ``access.name`` rather than ``name``.

        If the access is local a new version is created: the first version
        of the object when the log is empty, otherwise the successor of the
        log's last version. If the access is remote it must already carry a
        version.

        Then, depending on ownership in the current view:

            - this replica owns the tag: append to the log, complete the
              access if local, and (unless writes are aggregated) send
              append entries and stop the heartbeat timer
            - another replica owns the tag: log the conflict and drop
            - nobody owns the tag: schedule a retry one heartbeat interval
              later and request the tag

        Raises AccessError for a remote write without a version.
        """
        # Create the write event using super.
        access = super(TagReplica, self).write(name, **kwargs)

        # Increase the session on access.
        self.handle_session()

        # Determine if the write is local or remote
        if access.is_local_to(self):
            # Record the number of attempts for the access
            access.attempts += 1

            # Fetch the latest version from the log.
            latest = self.log[access.name].lastVersion

            # Perform the write
            if latest is None:
                version = namespace(access.name)(self)
            else:
                version = latest.nextv(self)

            # Update the access with the latest version
            access.update(version)

        else:
            # If there is no version, raise an exception
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".
                    format(self))

            # Save the version variable for use below.
            version = access.version

        # Log the access at this replica
        access.log(self)

        # Are we the owner of this tag?
        if self.owns(access.name):
            # Perform the append entries. The log is keyed by object name;
            # using the raw argument here would key it by the access
            # object whenever a write is retried.
            self.log[access.name].append(version, self.epoch)
            # Update the version to track visibility latency
            version.update(self)

            # Complete the access if it was local
            if access.is_local_to(self):
                access.complete()

            # Now do AppendEntries
            # Also interrupt the heartbeat since we just sent AppendEntries
            if not settings.simulation.aggregate_writes:
                self.send_append_entries()
                if self.heartbeat:
                    self.heartbeat.stop()

            return access

        # Is there a different owner for the tag?
        owner = self.find_owner(access.name)
        if owner is not None:
            # Right now just drop the write on its face.
            self.sim.logger.info("ownership conflict: dropped {} at {}".format(
                access, self))
            return access.drop()

        # We're going to acquire the tag!
        # We're going to have some write latency, retry the write.
        Timer(self.env, self.heartbeat_interval,
              lambda: self.write(access)).start()

        # Request the ownership of the tag
        self.acquire(access.name)

        return access

    def run(self):
        """
        Generator driving the replica: once per heartbeat interval it
        either starts a new heartbeat timer (when in the OWNER state) or
        simply waits for the interval to pass.
        """
        while True:
            owner = self.state == State.OWNER

            if not owner:
                # Keep quiescing until a tag is owned.
                yield self.env.timeout(self.heartbeat_interval)
            else:
                self.heartbeat = Timer(
                    self.env, self.heartbeat_interval,
                    self.on_heartbeat_timeout
                )
                yield self.heartbeat.start()

    ######################################################################
    ## Helper Methods
    ######################################################################

    def owns(self, name):
        """
        Tells whether ``name`` is part of the tag that the current view
        records for this replica.
        """
        own_tag = self.view[self]
        return name in own_tag

    def find_owner(self, name):
        """
        Searches the current view for the owner whose tag contains
        ``name``. Returns the first such owner, or None if there is no
        owner of the tag.
        """
        matches = (
            holder for holder, objects in self.view.items()
            if name in objects
        )
        return next(matches, None)

    def acquire(self, tag):
        """
        Requests ownership of ``tag`` (a single name, a set or a
        frozenset) by sending a tag request for the union of the tag
        already held in the view and the requested one.
        """
        # A lone name is wrapped so it can be combined with a set
        requested = tag
        if not isinstance(requested, (set, frozenset)):
            requested = frozenset([requested])

        # Whatever we already hold is requested again alongside it
        combined = frozenset(self.view[self] | requested)

        # Send out the request for the combined tag
        self.send_tag_request(combined)

        # Log the acquisition attempt
        self.sim.logger.info(
            "{} is atempting to acquire tag {}".format(self, self.tag)
        )

    def release(self, tag=None):
        """
        Releases ``tag`` (a single name, a set or a frozenset) by sending
        a tag request for what remains of the held tag once it is removed.
        With no argument everything currently held is released.
        """
        # Default to the complete tag currently held
        dropping = self.view[self] if tag is None else tag

        # A lone name is wrapped so it can be subtracted from a set
        if not isinstance(dropping, (set, frozenset)):
            dropping = frozenset([dropping])

        # What is left over is the tag that gets requested
        remaining = frozenset(self.view[self] - dropping)
        self.send_tag_request(remaining)

        # Log the release attempt
        self.sim.logger.info(
            "{} is atempting to release tag {}".format(self, remaining)
        )

    def handle_session(self):
        """
        Refreshes the session on an access: the existing session timer is
        reset when there is one, otherwise a new timer is created whose
        callback is ``on_session_timeout`` bound to the current time.
        """
        # NOTE(review): the newly created timer is not started here, only
        # a later reset() touches it -- confirm this is intended.
        self.session = (
            self.session.reset() if self.session
            else Timer(
                self.env, self.session_timeout,
                partial(self.on_session_timeout, self.env.now)
            )
        )

    def get_log_state(self, tag=None):
        """
        Builds a mapping from object name to a LogState holding that
        object's log ``lastApplied``, ``lastTerm`` and ``commitIndex``.
        The objects come from ``tag`` or, when no tag is given, from every
        tag in the current view.
        """
        if tag is None:
            # Flatten all the tags of the view into one list of objects
            tag = []
            for objects in self.view.values():
                tag.extend(objects)

        states = {}
        for obj in tag:
            states[obj] = LogState(
                self.log[obj].lastApplied,
                self.log[obj].lastTerm,
                self.log[obj].commitIndex
            )
        return states

    def send_tag_request(self, tag):
        """
        Switches the replica to the TAGGING state, remembers ``tag`` as the
        requested tag and sends a RequestTag RPC to every neighbor. The
        RPC carries the current epoch, the whole view keyed by owner id
        (with this replica's entry replaced by the requested tag) and the
        replica itself.
        """
        # Enter the tagging state and keep the requested tag around
        self.state = State.TAGGING
        self.tag = tag

        # The view by owner id, with our own entry set to the request
        tagset = {}
        for owner, held in self.view.items():
            tagset[owner.id] = held
        tagset[self.id] = self.tag

        # One RPC object is shared by all the neighbors
        request = RequestTag(self.epoch, tagset, self)
        for peer in self.neighbors():
            self.send(peer, request)

    def send_append_entries(self, target=None):
        """
        Sends an AppendEntries message to every node in ``nextIndex``, or
        only to ``target`` when one is given. Does nothing unless the
        replica is in the OWNER state.

        For each object tracked for a node, the message carries the log
        entries from that node's next index onward (none when the log has
        nothing new, which makes the message a heartbeat) together with a
        LogState of the previous index, its term and the commit index.

        Note: fails silently if target is not in the neighbors list.
        """
        # ownership check
        if self.state != State.OWNER:
            return

        # Go through follower list. items() exists on both Python 2 and 3,
        # unlike iteritems().
        for node, objs in self.nextIndex.items():
            # Filter based on the target supplied.
            if target is not None and node != target:
                continue

            # Construct the entries, or empty for heartbeat
            # The tag contains the state of each item to be sent
            entries = defaultdict(list)
            tag = defaultdict(LogState)

            for obj, nidx in objs.items():
                # A rule directly from the Raft paper
                if self.log[obj].lastApplied >= nidx:
                    entries[obj] = self.log[obj][nidx:]

                # Compute the previous log index and term
                prevLogIndex = nidx - 1
                prevLogTerm = self.log[obj][prevLogIndex].term
                commitIndex = self.log[obj].commitIndex

                # Create the tag state
                tag[obj] = LogState(prevLogIndex, prevLogTerm, commitIndex)

            # Send the append entries message
            self.send(node, AppendEntries(self.epoch, self.id, tag, entries))

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Applies the side effects of the replica's current state:

            - READY: clears votes, the requested tag and the owner indices,
              and stops the heartbeat timer if there is one
            - TAGGING: increments the epoch, opens an election over the
              quorum with a vote for self, and stops the heartbeat timer
            - OWNER: builds ``nextIndex`` and ``matchIndex`` for every
              neighbor and every object in the owned tag

        Raises SimulationException for any other state.
        """

        # Do state specific tag modifications
        if self.state == State.READY:
            self.votes = None
            self.tag = None

            # Remove owner state
            self.nextIndex = None
            self.matchIndex = None

            # Also interrupt the heartbeat
            if self.heartbeat:
                self.heartbeat.stop()

        elif self.state == State.TAGGING:
            # Convert to tag acquisition/release
            self.epoch += 1

            # Create election and vote for self
            self.votes = Election([node.id for node in self.quorum()])
            self.votes.vote(self.id)

            # Also interrupt the heartbeat
            if self.heartbeat:
                self.heartbeat.stop()

        elif self.state == State.OWNER:

            # Next index starts just past the last applied entry per object
            self.nextIndex = {
                node: {
                    obj: self.log[obj].lastApplied + 1
                    for obj in self.view[self]
                }
                for node in self.neighbors()
            }

            # Match index starts at zero for every object
            self.matchIndex = {
                node: {obj: 0
                       for obj in self.view[self]}
                for node in self.neighbors()
            }

        else:
            # Must format self.state: a bare ``state`` is not defined here
            # and would raise NameError instead of this exception.
            raise SimulationException(
                "Unknown Tag Replica State: {!r} set on {}".format(
                    self.state, self))

    def on_heartbeat_timeout(self):
        """
        Heartbeat timer callback: while in the OWNER state, sends append
        entries (a heartbeat or the aggregated writes). Any other state
        makes this a no-op.
        """
        if self.state == State.OWNER:
            self.send_append_entries()

    def on_session_timeout(self, started):
        """
        Session timer callback. Logs the end of the session, records its
        length (current time minus ``started``) in the simulation results
        under 'session length', clears the session and releases the tag.
        """
        now = self.env.now
        duration = now - started

        message = "session on {} terminated at {} ({} ms)".format(
            self.id, now, duration)
        self.sim.logger.info(message)

        self.sim.results.update('session length', (self.id, duration))

        # The session is over: forget the timer, then give up the tag.
        self.session = None
        self.release()

    def on_request_tag_rpc(self, msg):
        """
        Votes on a tag request from another replica and sends the vote
        back as a TagResponse carrying the local epoch.

        The request is rejected when its epoch is lower than the local
        epoch, or when any requested object is owned in the local view by
        a replica other than the candidate it is requested for.
        """
        rpc = msg.value

        # A request from an older epoch is never accepted.
        accept = not (rpc.epoch < self.epoch)

        if accept:
            # No object may already belong to someone else in our view.
            for candidate, tagset in rpc.tag.items():
                holders = (self.find_owner(tag) for tag in tagset)
                conflict = any(
                    holder is not None and holder.id != candidate
                    for holder in holders
                )
                if conflict:
                    accept = False
                    break

        # Log the vote decision
        if accept:
            amsg = "accepted"
        else:
            amsg = "did not accept"

        requester = rpc.candidate.id
        self.sim.logger.info("{} {} tag [{}] for {}".format(
            self, amsg, ",".join(rpc.tag[requester]), requester))

        # Send the vote response back to the tag requester
        response = TagResponse(self.epoch, accept)
        return self.send(msg.source, response)

    def on_tag_response_rpc(self, msg):
        """
        Process one replica's answer to our outstanding tag request.

        Only acted upon while TAGGING: a response from a newer epoch makes
        us adopt that epoch and resend the request; otherwise the vote is
        tallied and, once the election has passed, we become OWNER of the
        requested tag (or READY when the requested tag is empty) and send
        out append entries. Responses that arrive in READY or OWNER are
        ignored; any other state raises TagRPCException.
        """
        rpc = msg.value

        # Vote responses are stale once we have moved on from tagging
        if self.state in (State.READY, State.OWNER):
            return

        if not self.state == State.TAGGING:
            raise TagRPCException(
                "Tag request response in unknown state: '{}'".format(
                    self.state))

        # A responder in a newer epoch: adopt its epoch and ask again.
        if rpc.epoch > self.epoch:
            self.epoch = rpc.epoch
            self.send_tag_request(self.tag)

            self.sim.logger.info("{} retrying tag request for {}".format(
                self, self.tag))
            return

        # Tally the vote; nothing more to do until the election passes
        self.votes.vote(msg.source.id, rpc.accept)
        if not self.votes.has_passed():
            return

        # Election passed: take ownership of the tag, or go back to READY
        # when there is nothing to own.
        if self.tag:
            self.state = State.OWNER
            self.view[self] = set(self.tag)
        else:
            self.state = State.READY

        # Announce the ownership change
        self.send_append_entries()

        self.sim.logger.info("{} tag goes to: {}".format(
            self, self.view[self]))

        # Track the tag size over time
        tag_size = len(self.view[self])
        self.sim.results.update('tag size', (self.id, self.env.now, tag_size))

    def on_append_entries_rpc(self, msg):
        """
        Handles an append entries message: for every object named in the
        message, checks the local log against the sender's previous entry and
        appends the new entries if the check passes.

        A message from an older epoch is rejected as a whole with an
        AEResponse marked Reason.EPOCH. Otherwise the sender's view entry is
        replaced by the objects in the message, a newer epoch is adopted, and
        each object is processed independently. The reply is an AEResponse
        with per-object success flags and LogState snapshots, marked
        Reason.OK when every object succeeded and Reason.LOG otherwise.
        """
        rpc = msg.value

        # reply false if the epoch < current epoch
        if rpc.epoch < self.epoch:
            self.sim.logger.info(
                "{} doesn't accept append entries in epoch {} for epoch {}".
                format(self, self.epoch, rpc.epoch))

            # Send back the request that you made originally: every object is
            # marked failed and the sender's own rpc.tag is echoed back.
            return self.send(
                msg.source,
                AEResponse(self.epoch, {obj: False
                                        for obj in rpc.tag.keys()}, rpc.tag,
                           Reason.EPOCH))

        # Update the view to match the view of the append entries
        # Update the epoch to match the rpc of the append entries
        self.view[msg.source] = set(rpc.tag.keys())
        if self.epoch < rpc.epoch:
            self.epoch = rpc.epoch

        # Now for each object in the RPC, perform Raft-like append entries.
        # The success tracking is a complete tracking for all objects, will
        # return false even if we need to update the log for only one thing.
        # We will reply back with a state object that has per-object details.
        success = defaultdict(bool)
        state = defaultdict(LogState)

        # rpc.tag maps each object to the sender's previous log position
        # (read below through .index, .epoch and .commit).
        for obj, prev in rpc.tag.items():
            entries = rpc.entries[obj]
            objlog = self.log[obj]

            # If log doesn't contain an entry at prev index matching epoch.
            if objlog.lastApplied < prev.index or objlog[
                    prev.index].term != prev.epoch:

                # Perform the logging of this state failure
                if objlog.lastApplied < prev.index:
                    self.sim.logger.info(
                        "{} doesn't accept append to {} index {} where last applied is {}"
                        .format(self, obj, prev.index, objlog.lastApplied))
                else:
                    self.sim.logger.info(
                        "{} doesn't accept append to {} due to epoch mismatch: {} vs {}"
                        .format(self, obj, prev.epoch,
                                objlog[prev.index].term))

                # Mark that there is a problem and continue; the reply still
                # carries our current log state for this object.
                success[obj] = False
                state[obj] = LogState(objlog.lastApplied, objlog.lastTerm,
                                      objlog.lastCommit)
                continue

            # At this point the entries are accepted because of continue statements
            if entries:
                if objlog.lastApplied >= prev.index:
                    # If existing entry conflicts with a new one (same index, different epochs)
                    # Delete the existing entry and all that follow it.
                    # NOTE(review): the check above already `continue`d on a
                    # term mismatch at prev.index, so this truncate looks
                    # unreachable as written -- confirm the intended index.
                    if objlog[prev.index].term != prev.epoch:
                        objlog.truncate(prev.index)

                # The local log already extends past prev.index, so appending
                # would stack the new entries on top of existing ones.
                if objlog.lastApplied > prev.index:
                    # Better look into what's happening here!
                    raise TagRPCException(
                        "{} is possibly receiving duplicate append entries".
                        format(self))

                # Append any new entries not already in the log.
                for entry in entries:
                    # Add the entry/epoch to the log
                    objlog.append(*entry)

                    # Update the versions to compute visibilities
                    entry[0].update(self)

                # Log the last write from the append entries
                self.sim.logger.debug(
                    "appending {} entries to {} log on {} (term {}, commit {})"
                    .format(len(entries), obj, self, objlog.lastTerm,
                            objlog.commitIndex))

            # Update the commit index and save the state of the object.
            # The commit index never moves past what has been applied locally.
            if prev.commit > objlog.commitIndex:
                objlog.commitIndex = min(prev.commit, objlog.lastApplied)

            success[obj] = True
            state[obj] = LogState(objlog.lastApplied, objlog.lastTerm,
                                  objlog.lastCommit)

        # Return the response back to the owner
        reason = Reason.OK if all(success.values()) else Reason.LOG
        return self.send(msg.source,
                         AEResponse(self.epoch, success, state, reason))

    def on_ae_response_rpc(self, msg):
        """
        Handles acknowledgment of append entries messages.

        As OWNER: updates the per-object nextIndex/matchIndex of the
        responding replica, advances the commit index of each object in the
        response when a passing Election of matchIndex votes covers an entry
        from the current epoch, and resends append entries to the responder
        if any object failed. As TAGGING: a response from a newer epoch
        triggers a new tag request. As READY: the response is ignored. Any
        other state raises TagRPCException.
        """
        rpc = msg.value
        retry = False

        if self.state == State.OWNER:

            # Update state of followers in the tag group
            for obj, success in rpc.success.items():
                if success:
                    # rpc.tag[obj].index is the log index reported by the
                    # responder for this object.
                    self.nextIndex[msg.source][obj] = rpc.tag[obj].index + 1
                    self.matchIndex[msg.source][obj] = rpc.tag[obj].index

                else:
                    # If the epoch is not the same, update accordingly.
                    if rpc.epoch > self.epoch:
                        self.epoch = rpc.epoch

                    # If the failure was because of the epoch, simply retry.
                    if rpc.reason == Reason.EPOCH:
                        retry = True

                    # Otherwise decrement the next index and to retry
                    elif rpc.reason == Reason.LOG:
                        self.nextIndex[msg.source][obj] -= 1
                        retry = True

                    else:
                        raise TagRPCException(
                            "Unknown append entries failure reason: {}".format(
                                rpc.reason))

            # Determine if we can commit the entry
            # (`state` is not used below; only the object names are needed.)
            for obj, state in rpc.tag.items():
                log = self.log[obj]
                # Scan from the newest applied index down to just above the
                # commit index and stop at the first index that can commit.
                for n in xrange(log.lastApplied, log.commitIndex, -1):
                    commit = Election(self.matchIndex.keys())
                    for node, objs in self.matchIndex.items():
                        match = objs[obj]
                        commit.vote(node, match >= n)

                    # Only an entry written in the current epoch may commit.
                    if commit.has_passed() and log[n].term == self.epoch:
                        # Commit all versions from the last log to now.
                        for idx in xrange(log.commitIndex, n + 1):
                            # Entries without a version have nothing to mark.
                            if not log[idx].version: continue
                            log[idx].version.update(self, commit=True)

                        # Set the commit index and break
                        log.commitIndex = n
                        break

            # If retry, send append entries back to the source.
            if retry: self.send_append_entries(msg.source)

        elif self.state == State.TAGGING:
            # Determine if we need to retry the tagging again.
            if rpc.epoch > self.epoch:
                # Retry the tag request
                self.epoch = rpc.epoch
                self.send_tag_request(self.tag)

                self.sim.logger.info("{} retrying tag request for {}".format(
                    self, self.tag))

                return

        elif self.state == State.READY:
            # Ignore AE messages if we're not an owner anymore.
            return

        else:
            raise TagRPCException("Response in unknown state: '{}'".format(
                self.state))

    def on_remote_access(self, msg):
        """
        Serve a read or write forwarded by another replica.

        When this replica does not own the accessed object, nothing is
        performed and a failed AccessResponse is sent back. Otherwise the
        access is passed to ``self.read`` or ``self.write`` according to its
        type (an unknown type raises KeyError) and a successful
        AccessResponse is sent. The result of ``send`` is returned.
        """
        access = msg.value.access

        if not self.owns(access.name):
            # Not ours to serve: report failure without touching the access
            served = False
        else:
            # Pick the handler by access type and apply it to the access
            handlers = {
                'read': self.read,
                'write': self.write,
            }
            handler = handlers[access.type]
            handler(access)
            served = True

        return self.send(msg.source, AccessResponse(self.epoch, served, access))

    def on_access_response_rpc(self, msg):
        """
        Receive the reply to a remote access; a successful reply completes
        the access it carries, an unsuccessful one is left untouched.
        """
        reply = msg.value
        if not reply.success:
            return

        reply.access.complete()
Example #16
0
class FloatedRaftReplica(RaftReplica):
    """
    Raft replica whose quorum is limited to the nodes in its own location.
    While it is leader, an anti-entropy timer periodically gossips the
    versions it has committed to one randomly chosen node in every other
    location.
    """

    def __init__(self, simulation, **kwargs):
        ## Initialize the replica
        super(FloatedRaftReplica, self).__init__(simulation, **kwargs)

        # Anti entropy settings
        self.ae_delay = kwargs.get('anti_entropy_delay', ANTI_ENTROPY_DELAY)
        self.ae_timer = None  # only set while this replica is leader
        self.ae_cache = []  # committed versions waiting for the next gossip

    # NOTE(review): gossip() reads this as an attribute (self.locations), so
    # `memoized` presumably behaves like a cached property -- confirm.
    @memoized
    def locations(self):
        """
        Returns all the locations in the network with Raft nodes.
        """
        return set(
            [node.location for node in self.neighbors(self.consistency)])

    def quorum(self):
        """
        Returns only nodes in the same location to do Raft consensus with.
        This is a generator; the replica itself is yielded last.
        """

        # Filter only connections that are in the same consistency group
        for node in self.neighbors(self.consistency):
            if node.location == self.location:
                yield node

        # Don't forget to yield self!
        yield self

    def remotes(self, location=None):
        """
        Returns only nodes that are not in the same location to float writes
        to using anti-entropy. This method is only used by the leader.
        Can also specify a specific location to fetch the remotes for. Note
        that specifying your current location will not return nodes.
        """

        # Filter only connections that are in the same consistency group
        for node in self.neighbors(self.consistency):
            if node.location != self.location:
                # When a location is requested, skip every other location.
                if location is not None and node.location != location:
                    continue
                yield node

    def gossip(self):
        """
        Sends the cached versions, wrapped as Write accesses, to one randomly
        selected node in every other location, then empties the cache and
        restarts the anti-entropy timer so that gossip repeats every
        ``ae_delay``.
        """

        # Gossip to one node at each location
        for location in self.locations:
            # Don't gossip to nodes in self!
            if location == self.location: continue

            # Select a random target to gossip to
            target = random.choice(list(self.remotes(location)))

            # Log the gossip that's happening
            self.sim.logger.debug("{} gossiping {} entries to {}".format(
                self, len(self.ae_cache), target))

            # A fresh set of Write accesses is built for every target.
            entries = tuple([
                Write(version.name, self, version) for version in self.ae_cache
            ])

            # Send all the values in the cache.
            self.send(target, Gossip(entries, len(self.ae_cache), -1))

        # Empty the cache on gossip
        self.ae_cache = []

        # Reset the anti-entropy timer
        self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
        self.ae_timer.start()

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Does the same stuff as super, but also - if leader; starts the anti
        entropy interval to do gossiping. Followers and candidates cancel
        any running anti-entropy timer. Raises SimulationException for a
        state other than FOLLOWER, CANDIDATE, LEADER or READY.
        """
        super(FloatedRaftReplica, self).on_state_change()

        if self.state in (State.FOLLOWER, State.CANDIDATE):
            # The hasattr guard covers being called before __init__ has
            # assigned ae_timer.
            if hasattr(self, 'ae_timer') and self.ae_timer is not None:
                # Cancel the anti-entropy timer.
                self.ae_timer.stop()
                self.ae_timer = None
        elif self.state == State.LEADER:
            self.ae_timer = Timer(self.env, self.ae_delay, self.gossip)
            self.ae_timer.start()
        elif self.state == State.READY:
            # This happens on the call to super, just ignore for now.
            pass
        else:
            raise SimulationException(
                "Unknown Floating Raft State: {!r} set on {}".format(
                    self.state, self))

    def on_gossip_rpc(self, message):
        """
        Handles the receipt of a gossip from another node. Expects multiple
        accesses (Write events) as entries. Every entry's version is flagged
        as gossiped and written locally, then an empty, successful
        GossipResponse is sent back to the sender.
        """
        entries = message.value.entries

        # Go through the entries from the RPC and write to local cluster.
        for access in entries:
            # The flag keeps on_ae_response_rpc from caching this version
            # for gossip again once it commits here.
            access.version.gossiped = True
            self.write(access)

        # Should we return with what's in our cache?
        # Respond to the sender
        self.send(message.source, GossipResponse([], 0, True, -1))

    def on_response_rpc(self, message):
        """
        Just receives the acknowledgment of the response.
        """
        pass

    def on_ae_response_rpc(self, msg):
        """
        Handles an append entries response without delegating to super
        (described by the original author as doing the same work as the
        super handler), and additionally caches every newly committed
        version that has not been gossiped so that gossip() can send it
        later. Raises RaftRPCException when the replica is not LEADER,
        CANDIDATE or FOLLOWER.
        """
        rpc = msg.value

        if self.state == State.LEADER:

            if rpc.success:
                self.nextIndex[msg.source] = rpc.lastLogIndex + 1
                self.matchIndex[msg.source] = rpc.lastLogIndex

            else:
                # Decrement next index and retry append entries
                self.nextIndex[msg.source] -= 1
                self.send_append_entries(msg.source)

            # Decide if we can commit the entry: scan from the newest applied
            # index down to just above the commit index.
            for n in xrange(self.log.lastApplied, self.log.commitIndex, -1):
                commit = Election(self.matchIndex.keys())
                for k, v in self.matchIndex.iteritems():
                    commit.vote(k, v >= n)

                # Log entries are indexed as (version, term) here; only an
                # entry from the current term may commit.
                if commit.has_passed() and self.log[n][1] == self.currentTerm:
                    # Commit all versions from the last log entry to now.
                    for idx in xrange(self.log.commitIndex, n + 1):
                        if self.log[idx][0] is None: continue

                        # Cache the version to anti-entropy!
                        # Versions that arrived through gossip are skipped.
                        version = self.log[idx][0]
                        if not hasattr(version,
                                       'gossiped') or not version.gossiped:
                            self.ae_cache.append(version)

                        self.log[idx][0].update(self, commit=True)

                    # Set the commit index and break
                    self.log.commitIndex = n
                    break

        elif self.state == State.CANDIDATE:

            # Decide whether or not to step down.
            if rpc.term >= self.currentTerm:
                ## Become a follower
                self.state = State.FOLLOWER

                ## Log the failed election
                self.sim.logger.info(
                    "{} has stepped down as candidate".format(self))

                return

        elif self.state == State.FOLLOWER:
            # Ignore AE messages if we are the follower.
            return

        else:
            raise RaftRPCException(
                "Append entries response in unknown state: '{}'".format(
                    self.state))
Example #17
0
class EventualReplica(Replica):
    """
    Eventually consistent replica: reads and local writes complete
    immediately against the local log, and writes are spread to other
    replicas by periodic pairwise gossip and by on-write rumoring.
    """

    def __init__(self, simulation, **kwargs):
        super(EventualReplica, self).__init__(simulation, **kwargs)

        # Eventually consistent settings
        self.ae_delay = kwargs.get("anti_entropy_delay", AE_DELAY)
        self.n_neighbors = kwargs.get("num_neighbors", NEIGHBORS)

        # Deprecated
        self.do_gossip = kwargs.get("do_gossip", DO_GOSSIP)
        self.do_rumoring = kwargs.get("do_rumoring", DO_RUMORING)

        self.log = MultiObjectWriteLog()  # the write log of the replica
        self.timeout = None  # anti entropy timer

    ######################################################################
    ## Properties
    ######################################################################

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def read(self, name, **kwargs):
        """
        Eventually consistent replicas simply return the latest version for
        the name that they have in their store. This easily could be stale or
        forked depending on writes elsewhere in the cluster.

        Returns the completed access, or the result of dropping the access
        when no version of the object is in the local log.
        """
        # Create the read event using super.
        access = super(EventualReplica, self).read(name, **kwargs)

        # Record the number of attempts for the access
        if access.is_local_to(self):
            access.attempts += 1

        # Fetch the latest version from the log
        version = self.log.get_latest_version(access.name)

        # If version is None then we haven't read anything; bail!
        if version is None:
            return access.drop(empty=True)

        # Eventual nodes read locally and immediately, so complete the read.
        access.update(version, completed=True)

        # Log the access from this particular replica.
        access.log(self)

        return access

    def write(self, name, **kwargs):
        """
        Performs a write to the object with the given name by first creating
        the access event using super. Note that other access events can be
        passed into the write method in the case of remote writes.

        The access will define if the write is local or not.
        If local: write to the latest local version and complete.
        If remote: append write to log if latest version of object else error.

        After local vs. remote do the following:

        1. append the write to the log as (version, id)
        2. cache the latest access for gossip or rumoring
        3. update the version for visibility latency
        4. call the rumor handler

        Returns the access. Raises AccessError for a remote write that has
        no version or whose version is not later than the latest local one.
        """
        # Create the write event using super.
        access = super(EventualReplica, self).write(name, **kwargs)

        # Determine if the write is local or remote
        if access.is_local_to(self):
            # Record the number of attempts for the access
            access.attempts += 1

            # Fetch the latest version from the log
            latest = self.log.get_latest_version(access.name)

            # Perform the write
            if latest is None:
                version = namespace(access.name)(self)
            else:
                version = latest.nextv(self)

            # Update the access with the latest version and complete
            access.update(version, completed=True)

        else:

            # If there is no version, raise an exception
            if access.version is None:
                raise AccessError("Attempting a remote write on {} without a version!".format(self))

            # Save the version variable for use below
            version = access.version
            current = self.log.get_latest_version(access.name)

            # Ensure that the version is the latest.
            if current is not None and version <= current:
                raise AccessError("Attempting unordered write of {} after write of {}".format(version, current))

        # At this point we've dealt with local vs. remote
        # Append the latest version to the local data store
        self.log.append(version, 0)

        # Handle the access according to eventual rules
        version.update(self)  # Update the version to track visibility latency
        access.log(self)  # Log the access from this particular replica.
        self.rumor(access)  # Rumor the access on demand

        # Return the access for subclass access.
        return access

    def run(self):
        """
        The run method basically implements an anti-entropy timer: it keeps
        yielding a freshly started gossip timeout, forever.
        """
        while True:
            yield self.get_anti_entropy_timeout()

    ######################################################################
    ## Helper Methods
    ######################################################################

    def gossip(self):
        """
        Pairwise gossip protocol by randomly selecting a neighbor and
        exchanging information about the state of the latest objects in the
        cache since the last anti-entropy delay.

        TODO: how to gossip to strong consistency nodes?
        """
        # If gossiping is not allowed, forget about it.
        if not self.do_gossip:
            return

        # Perform pairwise anti-entropy sessions with n_neighbors
        for target in self.get_anti_entropy_neighbors():
            # Send the latest version of ALL objects.
            entries = [self.log.get_latest_version(name).access for name in self.log.namespace]
            gossip = Gossip(tuple(entries), len(entries))
            self.send(target, gossip)

    def rumor(self, access):
        """
        Performs on access rumor mongering: sends the access as a Rumor to
        each selected anti-entropy neighbor.
        """
        # if rumoring is not allowed, forget about it.
        if not self.do_rumoring:
            return

        # Send the access to the selected anti-entropy neighbors
        for target in self.get_anti_entropy_neighbors():
            rumor = Rumor(access)
            self.send(target, rumor)

    def get_anti_entropy_timeout(self):
        """
        Creates the anti-entropy timeout and returns the result of starting
        it. In the future this could be random timeout not fixed.
        """
        self.timeout = Timer(self.env, self.ae_delay, self.gossip)
        return self.timeout.start()

    def select_anti_entropy_neighbor(self):
        """
        Implements the anti-entropy neighbor selection policy. By default this
        is simply uniform random selection of all the eventual neighbors.
        """
        return random.choice(self.neighbors(self.consistency))

    def get_anti_entropy_neighbors(self):
        """
        Selects the neighbors to perform anti-entropy with: a generator of
        ``n_neighbors`` independent selections (the same neighbor may be
        yielded more than once).
        """
        for _ in xrange(self.n_neighbors):
            yield self.select_anti_entropy_neighbor()

    def update_forte_children(self, current, remote):
        """
        This unfortunately named method is a recursive function that updates
        all the children of the remote version with the new forte number and
        returns the newly correct current version.

        The idea here is that if the current version has a lower forte number
        then we should update the children of the remote (higher forte) in
        order to make sure that the latest branch is current.

        This method provides backpressure from Raft to Eventual.

        ``current`` is the latest local version (possibly None) and
        ``remote`` the version received from another replica. Returns the
        version that should now be treated as current.
        """

        def update_forte(forte, version, current):
            """
            Recursive update the forte number for a particular version.
            """
            # Update all the version's children with its forte number.
            for child in version.children:
                # Only update children that are in the current log.
                if child in self.log:
                    # Update child forte to parent and detect current
                    child.forte = forte
                    if child > current:
                        current = child

                # Recurse on grandchildren
                current = update_forte(forte, child, current)

            # Return the maximal version (using forte numbers) discovered.
            return current

        # This function only needs be called if we're in federated versioning.
        if settings.simulation.versioning != "federated":
            return current

        # If the current is greater than the remote, return it.
        if current is None or current >= remote:
            return current

        # Check the forte number on the remote and update the children.
        if remote.forte > current.forte:
            strong = update_forte(remote.forte, remote, current)
            if strong > current:
                # Put the strong version at the end of the log and return it
                # as the new current version (or latest for this object)
                if strong in self.log:
                    self.log.remove(strong)
                    self.log.append(strong, strong.forte)
                    return strong
                else:
                    # This really shouldn't happen?!
                    # The version is interpolated so the warning names it
                    # instead of printing a bare "{}" placeholder.
                    self.sim.logger.warning(
                        "Attempting to move {} to end when not in log!".format(strong))

        # Last resort, return the current version.
        return current

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_gossip_rpc(self, message):
        """
        Handles the receipt of a gossip from another node. Expects multiple
        accesses (Write events) as entries. Entries later than the local
        version are written; for entries that are behind, the local latest
        access is sent back in the GossipResponse.
        """
        entries = message.value.entries
        updates = []

        # Go through the entries from the RPC and update log
        for access in entries:
            # Get the latest version from the log then update with forte
            current = self.log.get_latest_version(access.name)
            current = self.update_forte_children(current, access.version)

            # If the access is greater than our current version, write it!
            if current is None or access.version > current:
                self.write(access)

            # Is the the remote behind us? If so, send the latest version!
            elif access.version < current:
                updates.append(current.access)

            else:
                # Presumably the version are equal, so do nothing.
                continue

        # Success here just means whether or not we're responding with updates
        success = bool(updates)

        # Respond to the sender with the latest versions from our log
        self.send(message.source, GossipResponse(updates, len(updates), success))

    def on_gossip_response_rpc(self, message):
        """
        Handles the response to pairwise gossiping, updating entries from the
        responder's cache to the local log and latest version cache.
        """
        entries = message.value.entries

        for access in entries:
            current = self.log.get_latest_version(access.name)
            current = self.update_forte_children(current, access.version)

            # This is a new version or a later version than our current.
            if current is None or access.version > current:
                self.write(access)

    def on_rumor_rpc(self, message):
        """
        Handles the rumor message from the originator of the rumor. A later
        version is written locally (which rumors it onward); an earlier one
        is answered with the local latest access and success False.
        """
        access = message.value.access
        current = self.log.get_latest_version(access.name)
        current = self.update_forte_children(current, access.version)

        # Is the rumored version later than our current?
        if current is None or access.version > current:
            # Write the access which will rumor it out again
            self.write(access)

            # Respond True to the origin of the rumor
            response = RumorResponse(None, True)

        elif access.version < current:
            # Respond False to the origin with the later version
            response = RumorResponse(current.access, False)

        else:
            # Simply acknowledge receipt
            response = RumorResponse(None, True)

        # Send the response back to the source
        self.send(message.source, response)

    def on_rumor_response_rpc(self, message):
        """
        Handles the rumor acknowledgment. An unsuccessful response carries
        the responder's later access, which is written locally if it is
        still later than the local latest version.
        """
        response = message.value
        if not response.success:
            # This means that a later value has come in!
            # The later access travels on the response itself.
            later = response.access
            current = self.log.get_latest_version(later.name)
            current = self.update_forte_children(current, later.version)

            # If their response is later than our version, write it.
            if current is None or later.version > current:
                self.write(later)
Example #18
0
class RaftReplica(ConsensusReplica):
    def __init__(self, simulation, **kwargs):
        """
        Sets up a Raft replica on top of the parent replica initialization.
        The replica starts as a follower at term zero with an empty write log
        and cache. The read policy, write aggregation flag, election timeout
        and heartbeat interval can be passed as keyword arguments; module
        level defaults are used otherwise.
        """
        super(RaftReplica, self).__init__(simulation, **kwargs)

        # Raft state: role, term, vote, write log and local version cache
        self.state = State.FOLLOWER
        self.currentTerm = 0
        self.votedFor = None
        self.log = MultiObjectWriteLog()
        self.cache = {}

        # Configurable policies
        policy_name = kwargs.get("read_policy", READ_POLICY)
        self.read_policy = ReadPolicy.get(policy_name)
        self.aggregate_writes = kwargs.get("aggregate_writes", AGGREGATE_WRITES)

        # Election timeout and heartbeat timers
        election_timeout = kwargs.get("election_timeout", ELECTION_TIMEOUT)
        heartbeat_interval = kwargs.get("heartbeat_interval", HEARTBEAT_INTERVAL)
        self.timeout = ElectionTimer.fromReplica(self, election_timeout)
        self.heartbeat = Timer(self.env, heartbeat_interval, self.on_heartbeat_timeout)

        # Per-node replication indices; unset until there is leader state
        self.nextIndex = None
        self.matchIndex = None

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def recv(self, event):
        """
        Performs the message-wide term check before handing the event to the
        parent class for dispatch: if the RPC carries a term newer than this
        replica's, the replica becomes a follower and adopts that term.
        """
        incoming_term = event.value.value.term

        # A newer term anywhere in the cluster demotes us to follower.
        if incoming_term > self.currentTerm:
            self.state = State.FOLLOWER
            self.currentTerm = incoming_term

        # Let the parent record the message and call the RPC handler.
        return super(RaftReplica, self).recv(event)

    def read(self, name, **kwargs):
        """
        Performs a local read of the named object. The version returned is
        chosen by the replica's read policy (see read_via_policy); if there
        is nothing to read the access is dropped as empty, otherwise it is
        completed immediately and logged against this replica.
        """
        # The parent builds the read access event.
        access = super(RaftReplica, self).read(name, **kwargs)

        # Only accesses that originate here count as an attempt.
        if access.is_local_to(self):
            access.attempts += 1

        # NOTE: which version is visible (committed vs. latest) is decided
        # by the read policy rather than being hard coded to read committed.
        found = self.read_via_policy(access.name)

        if found is not None:
            # Local reads finish right away; then record them on this replica.
            access.update(found, completed=True)
            access.log(self)
            return access

        # Nothing has been written for this name as far as we can see.
        return access.drop(empty=True)

    def write(self, name, **kwargs):
        """
        Writes to the named object; may be called on any replica.

        The access event is created by the parent class, which lets this
        method distinguish local from remote writes:

        - local: a new version is derived via the write policy, attached to
          the access, and the access is logged.
        - remote: the access is logged and must already carry a version,
          otherwise an AccessError is raised.

        What happens next depends on the role of this replica:

        - not the leader: the version is cached (so this replica can read
          its own write) and the access is forwarded to the leader with
          send_remote_write, whose result is returned. A remote write
          arriving at a non-leader is additionally logged at info level.
        - leader: the access is appended to the log (and completed only if
          it is local), the version is updated for visibility tracking, and
          unless writes are aggregated AppendEntries is sent immediately and
          the heartbeat timer is stopped.
        """
        access = super(RaftReplica, self).write(name, **kwargs)
        local = access.is_local_to(self)

        if local:
            # Count the attempt, then mint the next version by policy.
            access.attempts += 1
            version = self.write_via_policy(access.name)
            access.update(version)
            access.log(self)
        else:
            # Remote accesses are logged first and must bring a version.
            access.log(self)
            if access.version is None:
                raise AccessError("Attempting a remote write on {} without a version!".format(self))
            version = access.version

        if self.state != State.LEADER:
            if not local:
                # A client sent a remote write to a node that isn't leading.
                self.sim.logger.info("remote write on follower node: {}".format(self))

            # Remember the version locally and pass the write to the leader.
            self.cache[access.name] = version
            return self.send_remote_write(access)

        # Leader path: local writes complete here, remote ones complete at
        # the replica that originated them.
        self.append_via_policy(access, complete=bool(local))

        # Everything below is leader-only work.
        assert self.state == State.LEADER

        # Track visibility latency for the version.
        forte = bool(settings.simulation.forte_on_append)
        version.update(self, forte=forte)

        # Without aggregation, replicate now; the AppendEntries doubles as a
        # heartbeat so the pending heartbeat is interrupted.
        if not self.aggregate_writes:
            self.send_append_entries()
            self.heartbeat.stop()

        return access

    def run(self):
        """
        Main process loop: followers and candidates wait on the election
        timeout, the leader waits on the heartbeat timer. Any other state
        raises a SimulationException.
        """
        while True:
            state = self.state

            if state == State.LEADER:
                timer = self.heartbeat
            elif state in {State.FOLLOWER, State.CANDIDATE}:
                timer = self.timeout
            else:
                raise SimulationException("Unknown Raft State: {!r} on {}".format(state, self))

            yield timer.start()

    ######################################################################
    ## Helper Methods
    ######################################################################

    def send_append_entries(self, target=None):
        """
        Sends an AppendEntries message to every node tracked in nextIndex,
        or only to ``target`` when one is given. Does nothing if this replica
        is not the leader.

        Note: a target that is not tracked in nextIndex matches nothing, so
        no message is sent and no error is raised.
        """
        if self.state != State.LEADER:
            return

        for node, nidx in self.nextIndex.iteritems():
            # Skip everyone but the requested target, if there is one.
            if target is not None and node != target:
                continue

            # The entry preceding the follower's next index anchors the RPC.
            prevLogIndex = nidx - 1
            prevLogTerm = self.log[prevLogIndex].term

            # Ship everything from the next index on, or nothing (heartbeat)
            # when the follower is already caught up.
            entries = self.log[nidx:] if self.log.lastApplied >= nidx else []

            rpc = AppendEntries(
                self.currentTerm, self.id, prevLogIndex, prevLogTerm, entries, self.log.commitIndex
            )
            self.send(node, rpc)

    def send_remote_write(self, access):
        """
        Forwards the access to the current leader as a RemoteWrite and
        returns the access. When no leader can be found the write is dropped
        and the result of ``access.drop()`` is returned instead.
        """
        leader = self.get_leader_node()

        if leader:
            # Hand the write over to the leader along with our term.
            self.send(leader, RemoteWrite(self.currentTerm, access))
            return access

        # Nobody to send to: note it and give up on the write.
        self.sim.logger.info("no leader: dropped write at {}".format(self))
        return access.drop()

    def get_leader_node(self):
        """
        Returns the single node in the quorum whose state is LEADER, or None
        if there isn't one. Finding more than one leader raises a
        SimulationException.
        """
        leaders = [peer for peer in self.quorum() if peer.state == State.LEADER]

        if not leaders:
            return None

        if len(leaders) > 1:
            raise SimulationException("MutipleLeaders?!")

        return leaders[0]

    def read_via_policy(self, name):
        """
        Returns a version for ``name`` from either the log or the cache
        according to the read policy set on the replica:

            - COMMIT: return the latest committed version (ignoring cache)
            - LATEST: return the latest version in the log or in the cache,
              whichever is greater

        Returns None when there is no version to read. Raises a
        SimulationException if the read policy is not recognized.
        """
        policy = self.read_policy

        # Read committed never consults the cache.
        if policy == ReadPolicy.COMMIT:
            return self.log.get_latest_commit(name)

        if policy == ReadPolicy.LATEST:
            # Latest version in the log, committed or not.
            version = self.log.get_latest_version(name)
            cached = self.cache.get(name)

            # Prefer the cached version when it is newer than the log, and
            # also when the log has nothing at all for this name: otherwise
            # a replica could not read its own cached write to a brand new
            # object until the entry is appended to its log.
            if cached is not None and (version is None or cached > version):
                return cached

            return version

        # If we've reached this point, we don't know what to do!
        raise SimulationException("Unknown read policy!")

    def write_via_policy(self, name):
        """
        Produces the version for a new write to ``name``. The parent version
        is whatever read_via_policy returns for the name; the result is that
        version's ``nextv``. If nothing is readable yet, a first version is
        created from the object's namespace instead.
        """
        # The read policy decides which version the write builds on.
        base = self.read_via_policy(name)

        if base is not None:
            return base.nextv(self)

        # First write to this object from this replica's point of view.
        return namespace(name)(self)

    def append_via_policy(self, access, complete=False):
        """
        Gatekeeper for appending to the leader's log. The default policy
        admits everything: the access version is appended at the current
        term, the access is completed when ``complete`` is set, and True is
        returned to signal that the append happened.

        NOTE: leader-only. Followers get their entries through AppendEntries,
        so calling this on a non-leader raises a RaftRPCException. Subclasses
        may override this to enforce stricter admission policies.
        """
        if self.state == State.LEADER:
            # Default policy: append whatever we are given.
            self.log.append(access.version, self.currentTerm)

            # Finish the access when the caller asks for it.
            if complete:
                access.complete()

            return True

        raise RaftRPCException("Append via policies called on a follower replica!")

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Resets the role-dependent properties of the replica after its state
        has changed:

        - FOLLOWER or CANDIDATE: clear the vote and the leader-only indices.
        - LEADER: initialize nextIndex (last applied + 1) and matchIndex (0)
          for every other node in the quorum.
        - READY: nothing to do.

        Any other state raises a SimulationException.
        """
        state = self.state

        if state in (State.FOLLOWER, State.CANDIDATE):
            # Candidates take the same reset as followers; a separate
            # CANDIDATE branch after this test could never be reached.
            self.votedFor = None
            self.nextIndex = None
            self.matchIndex = None

        elif state == State.LEADER:
            # Track replication progress for everyone except ourselves.
            followers = [node for node in self.quorum() if node != self]
            first_index = self.log.lastApplied + 1
            self.nextIndex = {node: first_index for node in followers}
            self.matchIndex = {node: 0 for node in followers}

        elif state == State.READY:
            # This happens on the call to super, just ignore for now.
            pass

        else:
            raise SimulationException("Unknown Raft State: {!r} set on {}".format(self.state, self))

    def on_heartbeat_timeout(self):
        """
        Heartbeat timer callback. The leader sends AppendEntries (a plain
        heartbeat or any aggregated writes); other roles do nothing.
        """
        if self.state == State.LEADER:
            self.send_append_entries()

    def on_election_timeout(self):
        """
        Election timer callback: the replica becomes a candidate for the
        next term, votes for itself, and asks every other node in the quorum
        for its vote.
        """
        # Switch roles first, then move on to the new term.
        self.state = State.CANDIDATE
        self.currentTerm += 1

        # Open a ballot over the whole quorum and cast our own vote.
        self.votes = Election([node.id for node in self.quorum()])
        self.votes.vote(self.id)
        self.votedFor = self.id

        # The same request goes out to every node but ourselves.
        request = RequestVote(self.currentTerm, self.id, self.log.lastApplied, self.log.lastTerm)
        for peer in self.quorum():
            if peer != self:
                self.send(peer, request)

        # Log the newly formed candidacy
        self.sim.logger.info("{} is now a leader candidate".format(self))

    def on_request_vote_rpc(self, msg):
        """
        Handles a RequestVote RPC. The vote is granted when the candidate's
        term is not behind ours, we have not voted for somebody else, and the
        log considers the candidate's last term/index as up to date. Granting
        the vote stops the election timer and records the candidate. A
        VoteResponse with the outcome is sent to the requester either way.
        """
        rpc = msg.value

        granted = (
            rpc.term >= self.currentTerm
            and (self.votedFor is None or self.votedFor == rpc.candidateId)
            and self.log.as_up_to_date(rpc.lastLogTerm, rpc.lastLogIndex)
        )

        if not granted:
            return self.send(msg.source, VoteResponse(self.currentTerm, False))

        self.sim.logger.info("{} voting for {}".format(self, rpc.candidateId))

        # Hold off our own election and remember who got the vote.
        self.timeout.stop()
        self.votedFor = rpc.candidateId
        return self.send(msg.source, VoteResponse(self.currentTerm, True))

    def on_vote_response_rpc(self, msg):
        """
        Handles the response to a RequestVote RPC. Only candidates tally the
        vote; once the election has passed the replica becomes leader, stops
        its election timer and sends AppendEntries to announce itself.
        Followers and leaders ignore vote responses, and any other state
        raises a RaftRPCException.
        """
        # Votes arriving after the election was decided are of no interest.
        if self.state in (State.FOLLOWER, State.LEADER):
            return

        if self.state != State.CANDIDATE:
            raise RaftRPCException("Vote response in unknown state: '{}'".format(self.state))

        # Record the vote and see whether that settles the election.
        self.votes.vote(msg.source.id, msg.value.voteGranted)
        if not self.votes.has_passed():
            return

        # We won: take over leadership and stop waiting for an election.
        self.state = State.LEADER
        self.timeout.stop()

        # Tell the quorum about the leadership change.
        self.send_append_entries()

        self.sim.logger.info("{} has become raft leader".format(self))

    def on_append_entries_rpc(self, msg):
        """
        Callback for the AppendEntries RPC call.

        Stops the election timer, then validates the request and replies to
        the sender with an AEResponse carrying the current term, a success
        flag, and this replica's last applied and last commit values:

        - False if the RPC term is behind the current term.
        - False if the local log is shorter than prevLogIndex, or the term
          stored at prevLogIndex differs from prevLogTerm.
        - True without appending if entries were sent but the local log
          already extends past prevLogIndex (treated as a possible duplicate).
        - True after appending the entries (if any) and advancing the commit
          index toward the leader's commit index.
        """
        rpc = msg.value

        # Stop the election timeout
        self.timeout.stop()

        # Reply false if term < current term
        if rpc.term < self.currentTerm:
            # NOTE(review): the message reports our own currentTerm, not the
            # stale rpc.term that was rejected -- confirm this is intended.
            self.sim.logger.info("{} doesn't accept write on term {}".format(self, self.currentTerm))
            return self.send(msg.source, AEResponse(self.currentTerm, False, self.log.lastApplied, self.log.lastCommit))

        # Reply false if log doesn't contain an entry at prevLogIndex whose
        # term matches previous log term.
        # Log entries are indexed positionally here; index 1 is compared with
        # a term, index 0 is used as the version further below.
        if self.log.lastApplied < rpc.prevLogIndex or self.log[rpc.prevLogIndex][1] != rpc.prevLogTerm:
            # Only the log message differs between the two rejection reasons.
            if self.log.lastApplied < rpc.prevLogIndex:

                self.sim.logger.info(
                    "{} doesn't accept write on index {} where last applied is {}".format(
                        self, rpc.prevLogIndex, self.log.lastApplied
                    )
                )
            else:
                self.sim.logger.info(
                    "{} doesn't accept write for term mismatch {} vs {}".format(
                        self, rpc.prevLogTerm, self.log[rpc.prevLogIndex][1]
                    )
                )

            return self.send(msg.source, AEResponse(self.currentTerm, False, self.log.lastApplied, self.log.lastCommit))

        # At this point AppendEntries RPC is accepted
        if rpc.entries:
            if self.log.lastApplied >= rpc.prevLogIndex:
                # If existing entry conflicts with new one (same index, different terms)
                # Delete the existing entry and all that follow it.
                # NOTE(review): a term mismatch at prevLogIndex has already
                # been rejected by the check above, so this truncate looks
                # unreachable as written -- confirm before relying on it.
                if self.log[rpc.prevLogIndex][1] != rpc.prevLogTerm:
                    self.log.truncate(rpc.prevLogIndex)

            # The local log already has entries after prevLogIndex: nothing
            # is appended, but the sender still gets a success response.
            if self.log.lastApplied > rpc.prevLogIndex:
                # Otherwise this could be a message that is sent again
                # raise RaftRPCException(
                #     "{} is possibly receiving a duplicate append entries!".format(self)
                # )
                self.sim.logger.warn("{} is possibly receiving a duplicate append entries!".format(self))
                return self.send(
                    msg.source, AEResponse(self.currentTerm, True, self.log.lastApplied, self.log.lastCommit)
                )

            # Append any new entries not already in the log.
            for entry in rpc.entries:
                # Add the entry/term to the log
                # (each entry is unpacked positionally into log.append)
                self.log.append(*entry)
                self.sim.logger.debug("appending {} to {} on {}".format(entry[0], entry[1], self))

                # Update the versions to compute visibilities
                entry[0].update(self)

            # Log the last write from the append entries.
            self.sim.logger.debug(
                "{} writes {} at idx {} (term {}, commit {})".format(
                    self, self.log.lastVersion, self.log.lastApplied, self.log.lastTerm, self.log.commitIndex
                )
            )

        # If leaderCommit > commitIndex, update commit Index
        # (never beyond what has actually been applied locally)
        if rpc.leaderCommit > self.log.commitIndex:
            self.log.commitIndex = min(rpc.leaderCommit, self.log.lastApplied)

        # Return success response.
        return self.send(msg.source, AEResponse(self.currentTerm, True, self.log.lastApplied, self.log.lastCommit))

    def on_ae_response_rpc(self, msg):
        """
        Handles acknowledgment of append entries message.

        - Leader: on success records the responder's progress in nextIndex
          and matchIndex; on failure backs nextIndex off by one (never below
          1) and resends AppendEntries to that node. In both cases it then
          tries to advance the commit index.
        - Candidate: steps down to follower if the response term is at least
          the current term.
        - Follower: ignores the response.

        Raises a RaftRPCException for any other state.
        """
        rpc = msg.value

        if self.state == State.LEADER:

            if rpc.success:
                # The responder reports how far its log goes.
                self.nextIndex[msg.source] = rpc.lastLogIndex + 1
                self.matchIndex[msg.source] = rpc.lastLogIndex

            else:
                # Decrement next index and retry append entries
                # Ensure to floor the nextIndex to 1 (the start of the log).
                nidx = self.nextIndex[msg.source] - 1
                self.nextIndex[msg.source] = max(nidx, 1)
                self.send_append_entries(msg.source)

            # Decide if we can commit the entry
            # Walk backwards from the newest entry down to (but excluding)
            # the current commit index and stop at the first committable one.
            for n in xrange(self.log.lastApplied, self.log.commitIndex, -1):
                # Every node tracked in matchIndex votes on whether it has
                # replicated index n.
                commit = Election(self.matchIndex.keys())
                for k, v in self.matchIndex.iteritems():
                    commit.vote(k, v >= n)

                # Only an entry whose stored term equals the current term is
                # committed directly.
                if commit.has_passed() and self.log[n][1] == self.currentTerm:
                    # Commit all versions from the last log entry to now.
                    # The range starts at the current commit index itself and
                    # includes n; entries without a version are skipped.
                    for idx in xrange(self.log.commitIndex, n + 1):
                        if self.log[idx][0] is None:
                            continue
                        forte = True if settings.simulation.forte_on_commit else False
                        self.log[idx][0].update(self, commit=True, forte=forte)

                    # Set the commit index and break
                    self.log.commitIndex = n
                    break

        elif self.state == State.CANDIDATE:

            # Decide whether or not to step down.
            if rpc.term >= self.currentTerm:
                ## Become a follower
                self.state = State.FOLLOWER

                ## Log the failed election
                self.sim.logger.info("{} has stepped down as candidate".format(self))

                return

        elif self.state == State.FOLLOWER:
            # Ignore AE messages if we are the follower.
            return

        else:
            raise RaftRPCException("Append entries response in unknown state: '{}'".format(self.state))

    def on_remote_write_rpc(self, message):
        """
        Handles a RemoteWrite: the access carried in the message's
        ``version`` field is written on this replica and a WriteResponse is
        returned to the sender, flagged successful unless the access ended
        up dropped.
        """
        remote = message.value.version

        # Run the write through the normal write path on this replica.
        self.write(remote)

        # A dropped access means the write did not go through.
        succeeded = not remote.is_dropped()

        reply = WriteResponse(self.currentTerm, succeeded, remote)
        self.send(message.source, reply)

    def on_write_response_rpc(self, message):
        """
        Handles the reply to a remote write: a successful reply completes
        the access it carries, an unsuccessful one is ignored.
        """
        reply = message.value
        if not reply.success:
            return

        reply.access.complete()
Example #19
0
class EventualReplica(Replica):

    def __init__(self, simulation, **kwargs):
        """
        Sets up an eventually consistent replica on top of the parent
        replica initialization. The anti-entropy delay, the number of
        neighbors, and the gossip/rumoring switches can be passed as keyword
        arguments; module level defaults are used otherwise.
        """
        super(EventualReplica, self).__init__(simulation, **kwargs)

        # Anti-entropy configuration
        self.ae_delay = kwargs.get('anti_entropy_delay', AE_DELAY)
        self.n_neighbors = kwargs.get('num_neighbors', NEIGHBORS)

        # Switches for the two dissemination styles (marked deprecated)
        self.do_gossip = kwargs.get('do_gossip', DO_GOSSIP)
        self.do_rumoring = kwargs.get('do_rumoring', DO_RUMORING)

        # Write log of the replica
        self.log = MultiObjectWriteLog()

        # Anti-entropy timer, created on demand
        self.timeout = None

    ######################################################################
    ## Properties
    ######################################################################

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def read(self, name, **kwargs):
        """
        Reads the latest version of the named object from the local log.
        The result may be stale or forked with respect to writes elsewhere.
        If the log has no version for the name the access is dropped as
        empty; otherwise it is completed immediately and logged.
        """
        # The parent builds the read access event.
        access = super(EventualReplica, self).read(name, **kwargs)

        # Only accesses that originate here count as an attempt.
        if access.is_local_to(self):
            access.attempts += 1

        found = self.log.get_latest_version(access.name)

        if found is not None:
            # Reads are local and immediate, so finish and record the access.
            access.update(found, completed=True)
            access.log(self)
            return access

        # Nothing has been written for this name here; bail!
        return access.drop(empty=True)

    def write(self, name, **kwargs):
        """
        Writes to the named object. The access event is created by the
        parent class; an existing access may be passed in place of a name for
        remote writes.

        - local: the next version after the latest one in the log (or a first
          version from the object's namespace) is created and the access is
          completed with it.
        - remote: the access must carry a version that is strictly later
          than the latest one in the log; otherwise an AccessError is raised.

        In both cases the version is then appended to the log, updated for
        visibility tracking, the access is logged against this replica and
        handed to the rumor handler. The access is returned.
        """
        access = super(EventualReplica, self).write(name, **kwargs)

        if not access.is_local_to(self):
            # Remote writes have to bring their own version along.
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".format(self)
                )

            version = access.version
            current = self.log.get_latest_version(access.name)

            # Refuse anything that doesn't move the object forward.
            if current is not None and version <= current:
                raise AccessError(
                    "Attempting unordered write of {} after write of {}".format(version, current)
                )

        else:
            # Count the attempt for accesses that originate here.
            access.attempts += 1

            # Build on the latest known version, or start the object off.
            latest = self.log.get_latest_version(access.name)
            version = namespace(access.name)(self) if latest is None else latest.nextv(self)

            # Local writes complete right away.
            access.update(version, completed=True)

        # Store the version in the local log.
        self.log.append(version, 0)

        # Track visibility, record the access here, then spread the word.
        version.update(self)
        access.log(self)
        self.rumor(access)

        # Subclasses may want to work with the access.
        return access

    def run(self):
        """
        Process loop: repeatedly waits on a fresh anti-entropy timeout.
        """
        while True:
            timeout = self.get_anti_entropy_timeout()
            yield timeout

    ######################################################################
    ## Helper Methods
    ######################################################################

    def gossip(self):
        """
        Pairwise gossip: for each selected anti-entropy neighbor, sends a
        Gossip message containing the access of the latest version of every
        object name known to the log. Does nothing when gossiping is
        switched off.

        TODO: how to gossip to strong consistency nodes?
        """
        if not self.do_gossip:
            return

        for target in self.get_anti_entropy_neighbors():
            # Snapshot the latest access of ALL objects for this session.
            latest = tuple(
                self.log.get_latest_version(name).access
                for name in self.log.namespace
            )
            self.send(target, Gossip(latest, len(latest)))

    def rumor(self, access):
        """
        Rumor mongering on access: sends a Rumor carrying the access to each
        selected anti-entropy neighbor. Does nothing when rumoring is
        switched off.
        """
        if self.do_rumoring:
            # One fresh Rumor message per selected neighbor.
            for target in self.get_anti_entropy_neighbors():
                self.send(target, Rumor(access))

    def get_anti_entropy_timeout(self):
        """
        Replaces the anti-entropy timer with a new one that calls gossip
        after the fixed anti-entropy delay, starts it, and returns the result
        of starting it. (A random delay could be used here in the future.)
        """
        timer = Timer(self.env, self.ae_delay, self.gossip)
        self.timeout = timer
        return timer.start()

    def select_anti_entropy_neighbor(self):
        """
        Neighbor selection policy for anti-entropy. The default picks
        uniformly at random among the neighbors returned for this replica's
        own consistency level.
        """
        candidates = self.neighbors(self.consistency)
        return random.choice(candidates)

    def get_anti_entropy_neighbors(self):
        """
        Lazily yields n_neighbors anti-entropy targets, each chosen by an
        independent call to the selection policy (so repeats are possible).
        """
        pick = self.select_anti_entropy_neighbor
        for _ in xrange(self.n_neighbors):
            yield pick()

    def update_forte_children(self, current, remote):
        """
        Propagates the forte number of the remote version to its descendants
        and returns the version that should be treated as current afterwards.

        This only does anything under federated versioning, when a current
        version exists, the current version is less than the remote, and the
        remote carries a higher forte number. In that case every descendant
        of the remote that is in the local log receives the remote's forte
        number. If that makes one of them greater than the current version,
        it is moved to the end of the log and returned as the new current
        version. In every other case ``current`` is returned unchanged.

        This method provides backpressure from Raft to Eventual.
        """

        def update_forte(forte, version, current):
            """
            Recursively stamps ``forte`` on the descendants of ``version``
            that are in the log, returning the greatest version seen.
            """
            for child in version.children:
                # Only update children that are in the current log.
                if child in self.log:
                    child.forte = forte
                    if child > current:
                        current = child

                # Recurse on grandchildren, even below children not in the log
                current = update_forte(forte, child, current)

            # Return the maximal version (using forte numbers) discovered.
            return current

        # This function only needs be called if we're in federated versioning.
        if settings.simulation.versioning != "federated":
            return current

        # Nothing to do without a current version, or when it isn't behind.
        if current is None or current >= remote:
            return current

        # Check the forte number on the remote and update the children.
        if remote.forte > current.forte:
            strong = update_forte(remote.forte, remote, current)
            if strong > current:
                # Put the strong version at the end of the log and return it
                # as the new current version (or latest for this object)
                if strong in self.log:
                    self.log.remove(strong)
                    self.log.append(strong, strong.forte)
                    return strong
                else:
                    # This really shouldn't happen?! Name the version in the
                    # warning so the log line is actually actionable.
                    self.sim.logger.warning(
                        "Attempting to move {} to end when not in log!".format(strong)
                    )

        # Last resort, return the current version.
        return current

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_gossip_rpc(self, message):
        """
        Handles a gossip message from another node. Every access in the
        message is compared with the local latest version of its object:
        newer ones are written locally, and for older ones the local access
        is collected for the reply. The sender always gets a GossipResponse
        with the collected accesses, their count, and a flag that is True
        only when there is at least one of them.
        """
        newer_here = []

        for access in message.value.entries:
            # Local latest version for the object, adjusted for forte.
            current = self.update_forte_children(
                self.log.get_latest_version(access.name), access.version
            )

            if current is None or access.version > current:
                # The sender is ahead of us: take its write.
                self.write(access)
            elif access.version < current:
                # The sender is behind us: remember what to send back.
                newer_here.append(current.access)
            # Otherwise the versions are presumably equal; nothing to do.

        # "Success" only says whether the reply carries any updates.
        reply = GossipResponse(newer_here, len(newer_here), bool(newer_here))
        self.send(message.source, reply)

    def on_gossip_response_rpc(self, message):
        """
        Apply the entries a gossip partner sent back in its response.

        Each returned access is written locally unless the local log already
        holds a version that is not older than the one in the access.
        """
        for access in message.value.entries:
            # Latest local version for the object, adjusted for forte numbers.
            latest = self.log.get_latest_version(access.name)
            latest = self.update_forte_children(latest, access.version)

            # Skip entries that do not advance what we already have.
            if latest is not None and not (access.version > latest):
                continue

            self.write(access)

    def on_rumor_rpc(self, message):
        """
        React to a rumor received from its originator.

        The rumored access is written locally when it is newer than the local
        latest version (or when there is no local version). The sender always
        gets a ``RumorResponse``: True with no access when the rumor was
        applied or matched the local version, False together with the later
        local access when the rumor was stale.
        """
        access = message.value.access

        # Latest local version for the object, adjusted for forte numbers.
        latest = self.update_forte_children(
            self.log.get_latest_version(access.name), access.version
        )

        if latest is None or access.version > latest:
            # Newer than what we hold: apply it, then acknowledge.
            self.write(access)
            reply = RumorResponse(None, True)
        elif access.version < latest:
            # Stale rumor: reply False and include our later access.
            reply = RumorResponse(latest.access, False)
        else:
            # Same version on both sides: plain acknowledgment.
            reply = RumorResponse(None, True)

        self.send(message.source, reply)

    def on_rumor_response_rpc(self, message):
        """
        Handles the acknowledgment of a rumor sent by this replica.

        A successful response needs no action. An unsuccessful response
        carries an access that the responder considers later than the rumor;
        that access is written locally if it is newer than (or there is no)
        local latest version for the same object.
        """
        response = message.value
        if response.success:
            return

        # This means that a later value has come in!
        remote = response.access
        current = self.log.get_latest_version(remote.name)

        # Use the version carried by the response; the bare name ``access``
        # is not defined in this method.
        current = self.update_forte_children(current, remote.version)

        # If their response is later than our version, write it.
        if current is None or remote.version > current:
            self.write(remote)
Example #20
0
    def write(self, name, **kwargs):
        """
        When a replica performs a write it needs to decide if it can write to
        the tag locally, can acquire a tag for this object, or if it has to do
        something else like drop, wait, or remote write.

        If the access is local:

            - if the replica owns the tag, append and complete
            - if someone else owns the tag then drop, wait, or remote
            - if no one owns the tag, then attempt to acquire it

        If access is remote:

            - if we own the tag, then append but do not complete (at local)
            - if someone else owns the tag, log and forward to owner
            - if no one owns the tag then respond false

        Current implementation: a write for a tag owned by another replica is
        dropped; a write for an unowned tag triggers a tag acquisition and is
        retried after one heartbeat interval.

        ``name`` is whatever the parent ``write`` accepts. The retry timer
        calls this method again with the access itself, so every lookup below
        uses ``access.name`` rather than the raw ``name`` argument.

        Returns the access (or the result of ``access.drop()`` when dropped).
        Raises AccessError for a remote write that carries no version.
        """
        # Create the write event using super.
        access = super(TagReplica, self).write(name, **kwargs)

        # Increase the session on access.
        self.handle_session()

        # Determine if the write is local or remote
        if access.is_local_to(self):
            # Record the number of attempts for the access
            access.attempts += 1

            # Fetch the latest version from the log.
            latest = self.log[access.name].lastVersion

            # Perform the write
            if latest is None:
                version = namespace(access.name)(self)
            else:
                version = latest.nextv(self)

            # Update the access with the latest version
            access.update(version)

        else:
            # If there is no version, raise an exception
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".
                    format(self))

            # Save the version variable for use below.
            version = access.version

        # Log the access at this replica
        access.log(self)

        # Are we the owner of this tag?
        if self.owns(access.name):
            # Perform the append entries. Key on access.name: on a retry the
            # ``name`` argument is the access object, not the object name.
            self.log[access.name].append(version, self.epoch)
            # Update the version to track visibility latency
            version.update(self)

            # Complete the access if it was local
            if access.is_local_to(self): access.complete()

            # Now do AppendEntries
            # Also interrupt the heartbeat since we just sent AppendEntries
            if not settings.simulation.aggregate_writes:
                self.send_append_entries()
                if self.heartbeat: self.heartbeat.stop()

            return access

        # Is there a different owner for the tag?
        owner = self.find_owner(access.name)
        if owner is not None:
            # Right now just drop the write on its face.
            self.sim.logger.info("ownership conflict: dropped {} at {}".format(
                access, self))
            return access.drop()

        # We're going to acquire the tag!
        else:
            # We're going to have some write latency, retry the write.
            Timer(self.env, self.heartbeat_interval,
                  lambda: self.write(access)).start()

            # Request the ownership of the tag
            self.acquire(access.name)

        return access
Example #21
0
class TagReplica(ConsensusReplica):
    """
    A consensus replica that owns "tags" (sets of object names).

    Ownership of a tag is requested from the neighbors with an epoch-stamped
    vote (``RequestTag`` / ``TagResponse``). While a replica owns a tag it
    replicates writes to the owned objects with Raft-like append entries,
    keeping one write log per object name.
    """

    def __init__(self, simulation, **kwargs):
        """
        Set up the tag replica state and then initialize the base replica.

        Reads the optional ``session_timeout`` and ``heartbeat_interval``
        keyword arguments, falling back to the module level defaults.
        """
        # NOTE(review): the attributes below are assigned before calling
        # super().__init__ -- presumably because the base initializer touches
        # them (e.g. through a state change); confirm before reordering.

        ## Timers for work
        self.session_timeout    = kwargs.get('session_timeout', SESSION_TIMEOUT)
        self.heartbeat_interval = kwargs.get('heartbeat_interval', HEARTBEAT_INTERVAL)
        self.session   = None
        self.heartbeat = None

        ## Initialize the tag specific settings
        self.epoch  = 0
        self.log    = defaultdict(WriteLog)   # object name -> write log
        self.view   = defaultdict(set)        # owner replica -> owned names

        ## Owner state
        self.nextIndex  = None
        self.matchIndex = None

        ## Initialize the replica
        super(TagReplica, self).__init__(simulation, **kwargs)
        self.state  = State.READY

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def read(self, name, **kwargs):
        """
        When a tag replica performs a read it has to decide whether or not to
        read locally or to make a remote read across the cluster.

        Convert the read into an access, then check if we own the object.
        If we do, then return the latest version in the log.
        If we don't and no one else does either, attempt to acquire the tag.
        If we don't and someone else does then either drop, wait, or remote.

        Current implementation: #2, MR, no remote access.
        If someone else owns tag, reads are dropped.

        Returns the access (or the result of ``access.drop()`` when dropped).

        TODO: Remote vs Local Reads
        """
        # Create the read event using super.
        access = super(TagReplica, self).read(name, **kwargs)

        # Record the number of attempts for the access
        if access.is_local_to(self): access.attempts += 1

        # Increase the session on access.
        self.handle_session()

        # Are we the owner of this tag?
        if self.owns(access.name):
            # TODO: Change to last commit!
            version = self.log[access.name].lastVersion

            # If the version is None, bail since we haven't read anything
            if version is None: return access.drop(empty=True)

            # Update the version, complete the read, and log the access
            access.update(version, completed=True)
            access.log(self)

            # Return, we're done reading!
            return access

        # Is there a different owner for the tag?
        owner = self.find_owner(access.name)
        if owner is not None:
            # Right now just drop the read on its face.
            self.sim.logger.info(
                "ownership conflict: dropped {} at {}".format(access, self)
            )
            return access.drop()

        # We're going to acquire the tag!
        else:
            # Log the access from this particular replica.
            access.log(self)

            # We're going to have some read latency, retry the read.
            Timer(
                self.env, self.heartbeat_interval, lambda: self.read(access)
            ).start()

            # Only the first attempt requests the tag, and only if a tag
            # request is not already in flight.
            if access.attempts <= 1 and self.state != State.TAGGING:
                # Request the ownership of the tag
                self.acquire(access.name)

        return access

    def write(self, name, **kwargs):
        """
        When a replica performs a write it needs to decide if it can write to
        the tag locally, can acquire a tag for this object, or if it has to do
        something else like drop, wait, or remote write.

        If the access is local:

            - if the replica owns the tag, append and complete
            - if someone else owns the tag then drop, wait, or remote
            - if no one owns the tag, then attempt to acquire it

        If access is remote:

            - if we own the tag, then append but do not complete (at local)
            - if someone else owns the tag, log and forward to owner
            - if no one owns the tag then respond false

        Current implementation: a write for a tag owned by another replica is
        dropped; a write for an unowned tag triggers a tag acquisition and is
        retried after one heartbeat interval.

        ``name`` is whatever the parent ``write`` accepts. The retry timer
        calls this method again with the access itself, so every lookup below
        uses ``access.name`` rather than the raw ``name`` argument.

        Returns the access (or the result of ``access.drop()`` when dropped).
        Raises AccessError for a remote write that carries no version.
        """
        # Create the write event using super.
        access = super(TagReplica, self).write(name, **kwargs)

        # Increase the session on access.
        self.handle_session()

        # Determine if the write is local or remote
        if access.is_local_to(self):
            # Record the number of attempts for the access
            access.attempts += 1

            # Fetch the latest version from the log.
            latest = self.log[access.name].lastVersion

            # Perform the write
            if latest is None:
                version = namespace(access.name)(self)
            else:
                version = latest.nextv(self)

            # Update the access with the latest version
            access.update(version)

        else:
            # If there is no version, raise an exception
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".format(self)
                )

            # Save the version variable for use below.
            version = access.version

        # Log the access at this replica
        access.log(self)

        # Are we the owner of this tag?
        if self.owns(access.name):
            # Perform the append entries. Key on access.name: on a retry the
            # ``name`` argument is the access object, not the object name.
            self.log[access.name].append(version, self.epoch)
            # Update the version to track visibility latency
            version.update(self)

            # Complete the access if it was local
            if access.is_local_to(self): access.complete()

            # Now do AppendEntries
            # Also interrupt the heartbeat since we just sent AppendEntries
            if not settings.simulation.aggregate_writes:
                self.send_append_entries()
                if self.heartbeat: self.heartbeat.stop()

            return access

        # Is there a different owner for the tag?
        owner = self.find_owner(access.name)
        if owner is not None:
            # Right now just drop the write on its face.
            self.sim.logger.info(
                "ownership conflict: dropped {} at {}".format(access, self)
            )
            return access.drop()

        # We're going to acquire the tag!
        else:
            # We're going to have some write latency, retry the write.
            Timer(
                self.env, self.heartbeat_interval, lambda: self.write(access)
            ).start()

            # Request the ownership of the tag
            self.acquire(access.name)

        return access

    def run(self):
        """
        We have to check in at every heartbeat interval. If we own a tag then
        send a heartbeat message, otherwise just keep quiescing.

        This is a generator that loops forever, yielding either the started
        heartbeat timer (as owner) or a plain environment timeout.
        """
        while True:
            if self.state == State.OWNER:
                self.heartbeat = Timer(
                    self.env, self.heartbeat_interval, self.on_heartbeat_timeout
                )
                yield self.heartbeat.start()
            else:
                yield self.env.timeout(self.heartbeat_interval)

    ######################################################################
    ## Helper Methods
    ######################################################################

    def owns(self, name):
        """
        Returns True if the name is in the current view for this replica.
        """
        return name in self.view[self]

    def find_owner(self, name):
        """
        Looks up the owner of the name in the current view.
        Returns None if there is no owner for the tag.
        """
        for owner, tag in self.view.items():
            if name in tag:
                return owner
        return None

    def acquire(self, tag):
        """
        Sends out the acquire tag RPC.

        ``tag`` may be a single name or a set/frozenset of names; the request
        is sent for the union of those names and the names already owned.
        """
        # Construct the tag to send out
        if not isinstance(tag, (set, frozenset)):
            tag = frozenset([tag])

        # Make sure to request the tag we already have
        tag = frozenset(self.view[self] | tag)

        # Request tag with all current tags
        self.send_tag_request(tag)

        # Log the tag acquisition (send_tag_request stored the tag on self)
        self.sim.logger.info(
            "{} is atempting to acquire tag {}".format(self, self.tag)
        )

    def release(self, tag=None):
        """
        Sends out the release tag RPC.

        ``tag`` may be a single name, a set/frozenset of names, or None to
        release everything currently owned. The request is sent for the
        owned names that remain after removing the released ones.
        """
        # Release all currently held tags
        if tag is None: tag = self.view[self]

        # Construct the tag to send out (if specified)
        if not isinstance(tag, (set, frozenset)):
            tag = frozenset([tag])

        # Request the difference of the tags we already have
        tag = frozenset(self.view[self] - tag)

        # Request tag with all current tags
        self.send_tag_request(tag)

        # Log the tag release
        self.sim.logger.info(
            "{} is atempting to release tag {}".format(self, tag)
        )

    def handle_session(self):
        """
        Starts a session timer if one isn't running, otherwise resets the
        currently running session timer on an additional access.
        """
        if not self.session:
            # Bind the session start time so the timeout handler can compute
            # the session duration.
            self.session = Timer(
                self.env, self.session_timeout,
                partial(self.on_session_timeout, self.env.now)
            )
        else:
            self.session = self.session.reset()

    def get_log_state(self, tag=None):
        """
        Constructs a log state object for append entries responses, either
        for the passed in tag or, when tag is None, for every object in the
        current view.

        Returns a dict mapping each object name to a LogState built from that
        object's log (lastApplied, lastTerm, commitIndex).
        """
        if tag is None:
            tag = [obj for view in self.view.values() for obj in view]

        return {
            obj: LogState(
                self.log[obj].lastApplied,
                self.log[obj].lastTerm,
                self.log[obj].commitIndex
            ) for obj in tag
        }

    def send_tag_request(self, tag):
        """
        Broadcasts a tag request for the passed in tag.

        Switches this replica to the TAGGING state, remembers the requested
        tag on ``self.tag`` and sends a RequestTag RPC to every neighbor.
        """
        # Change state to tagging and save tag locally
        self.state = State.TAGGING
        self.tag = tag

        # Request the entire tag in your current view.
        tagset = {
            owner.id: tagset
            for owner, tagset in self.view.items()
        }
        # Our own entry is the requested tag, not what we currently hold.
        tagset[self.id] = self.tag

        # Send the tag request RPC to each neighbor
        rpc = RequestTag(self.epoch, tagset, self)
        for neighbor in self.neighbors():
            self.send(neighbor, rpc)

    def send_append_entries(self, target=None):
        """
        Helper function to send append entries to quorum or a specific node.

        Does nothing unless this replica is currently an owner.

        Note: fails silently if target is not in the neighbors list.
        """
        # ownership check
        if not self.state == State.OWNER:
            return

        # Go through follower list.
        for node, objs in self.nextIndex.items():
            # Filter based on the target supplied.
            if target is not None and node != target:
                continue

            # Construct the entries, or empty for heartbeat
            # The tag contains the state of each item to be sent
            entries = defaultdict(list)
            tag = defaultdict(LogState)

            for obj, nidx in objs.items():
                # A rule directly from the Raft paper
                if self.log[obj].lastApplied >= nidx:
                    entries[obj] = self.log[obj][nidx:]

                # Compute the previous log index and term
                prevLogIndex = nidx - 1
                prevLogTerm  = self.log[obj][prevLogIndex].term
                commitIndex  = self.log[obj].commitIndex

                # Create the tag state
                tag[obj] = LogState(prevLogIndex, prevLogTerm, commitIndex)

            # Send the append entries message
            self.send(
                node, AppendEntries(
                    self.epoch, self.id, tag, entries
                )
            )

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Setting the state decides how the Tag node will interact.

        READY clears the vote, tag and owner state; TAGGING starts a new
        epoch and election; OWNER builds the per-neighbor next and match
        indices. Raises SimulationException for any other state.
        """

        # Do state specific tag modifications
        if self.state == State.READY:
            self.votes = None
            self.tag   = None

            # Remove owner state
            self.nextIndex  = None
            self.matchIndex = None

            # Also interrupt the heartbeat
            if self.heartbeat: self.heartbeat.stop()

        elif self.state == State.TAGGING:
            # Convert to tag acquisition/release
            self.epoch += 1

            # Create election and vote for self
            self.votes = Election([node.id for node in self.quorum()])
            self.votes.vote(self.id)

            # Also interrupt the heartbeat
            if self.heartbeat: self.heartbeat.stop()

        elif self.state == State.OWNER:

            # Create the next index and match index
            self.nextIndex = {
                node: {
                    obj: self.log[obj].lastApplied + 1
                    for obj in self.view[self]
                } for node in self.neighbors()
            }

            self.matchIndex = {
                node: {
                    obj: 0 for obj in self.view[self]
                } for node in self.neighbors()
            }

        else:
            # Report the offending state stored on the replica.
            raise SimulationException(
                "Unknown Tag Replica State: {!r} set on {}".format(self.state, self)
            )

    def on_heartbeat_timeout(self):
        """
        Time to send a heartbeat message to all tags.
        Does nothing unless this replica is currently an owner.
        """
        if not self.state == State.OWNER:
            return

        # Send heartbeat or aggregated writes
        self.send_append_entries()

    def on_session_timeout(self, started):
        """
        If the session times out then go ahead and release the tag.

        ``started`` is the simulation time at which the session began; the
        session duration is logged and recorded in the results.
        """
        duration = self.env.now - started

        self.sim.logger.info(
            "session on {} terminated at {} ({} ms)".format(
                self.id, self.env.now, duration
            )
        )

        self.sim.results.update(
            'session length', (self.id, duration)
        )

        self.session = None
        self.release()

    def on_request_tag_rpc(self, msg):
        """
        Respond to a request for a tag acquisition from a server.

        The request is rejected if its epoch is behind the local epoch or if
        any requested name is owned, in the local view, by a replica other
        than the one the request assigns it to. A TagResponse carrying the
        local epoch and the decision is sent back to the requester.
        """
        rpc = msg.value
        accept = True

        # The requested epoch must not be behind the local epoch.
        if rpc.epoch < self.epoch: accept = False

        # Ensure that no one else owns the tag in your current view.
        for candidate, tagset in rpc.tag.items():
            # Short circuit
            if not accept: break

            for tag in tagset:
                owner = self.find_owner(tag)
                if owner is not None and owner.id != candidate:
                    accept = False
                    break

        # Log the vote decision
        amsg = "accepted" if accept else "did not accept"
        lmsg = "{} {} tag [{}] for {}".format(
            self, amsg, ",".join(rpc.tag[rpc.candidate.id]), rpc.candidate.id
        )
        self.sim.logger.info(lmsg)

        # Send the vote response back to the tag requester
        return self.send(
            msg.source, TagResponse(self.epoch, accept)
        )

    def on_tag_response_rpc(self, msg):
        """
        Handle the votes from tag requests to other nodes.

        While TAGGING: a response from a later epoch restarts the request in
        that epoch; otherwise the vote is counted and, once the election has
        passed, the replica becomes OWNER of the requested tag (or READY if
        the requested tag is empty). Responses are ignored in the READY and
        OWNER states. Raises TagRPCException in any other state.
        """
        rpc = msg.value

        if self.state == State.TAGGING:
            # If the epoch is greater than the current epoch
            if rpc.epoch > self.epoch:
                # Retry the tag request
                self.epoch = rpc.epoch
                self.send_tag_request(self.tag)

                self.sim.logger.info(
                    "{} retrying tag request for {}".format(self, self.tag)
                )

                # Exit: no more work required!
                return

            # Update the current election
            self.votes.vote(msg.source.id, rpc.accept)
            if self.votes.has_passed():

                # Update our local tag and become owner.
                if self.tag:
                    self.state = State.OWNER
                    self.view[self] = set(self.tag)
                else:
                    self.state = State.READY

                # Send out the ownership change append entries
                self.send_append_entries()

                # Log the new tag owner
                self.sim.logger.info(
                    "{} tag goes to: {}".format(self, self.view[self])
                )

                # Record tag length over time
                self.sim.results.update(
                    'tag size', (self.id, self.env.now, len(self.view[self]))
                )

        elif self.state in (State.READY, State.OWNER):
            # Ignore vote responses if we've changed our state
            return

        else:
            raise TagRPCException(
                "Tag request response in unknown state: '{}'".format(self.state)
            )

    def on_append_entries_rpc(self, msg):
        """
        Handle an append entries message from a tag owner.

        A message from an earlier epoch is rejected with Reason.EPOCH.
        Otherwise the sender's tag replaces its entry in the local view, the
        local epoch is raised to the message epoch if needed, and every object
        in the message goes through a Raft-like consistency check and append.
        The reply carries a per-object success flag and per-object log state.
        """
        rpc = msg.value

        # reply false if the epoch < current epoch
        if rpc.epoch < self.epoch:
            self.sim.logger.info(
                "{} doesn't accept append entries in epoch {} for epoch {}".format(
                    self, self.epoch, rpc.epoch
                )
            )

            # Send back the request that you made originally.
            return self.send(
                msg.source, AEResponse(
                    self.epoch,
                    {obj: False for obj in rpc.tag.keys()},
                    rpc.tag, Reason.EPOCH
                )
            )

        # Update the view to match the view of the append entries
        # Update the epoch to match the rpc of the append entries
        self.view[msg.source] = set(rpc.tag.keys())
        if self.epoch < rpc.epoch:
            self.epoch = rpc.epoch

        # Now for each object in the RPC, perform Raft-like append entries.
        # The success tracking is a complete tracking for all objects, will
        # return false even if we need to update the log for only one thing.
        # We will reply back with a state object that has per-object details.
        success = defaultdict(bool)
        state   = defaultdict(LogState)

        for obj, prev in rpc.tag.items():
            entries = rpc.entries[obj]
            objlog  = self.log[obj]

            # If log doesn't contain an entry at prev index matching epoch.
            if objlog.lastApplied < prev.index or objlog[prev.index].term != prev.epoch:

                # Perform the logging of this state failure
                if objlog.lastApplied < prev.index:
                    self.sim.logger.info(
                        "{} doesn't accept append to {} index {} where last applied is {}".format(
                            self, obj, prev.index, objlog.lastApplied
                        )
                    )
                else:
                    self.sim.logger.info(
                        "{} doesn't accept append to {} due to epoch mismatch: {} vs {}".format(
                            self, obj, prev.epoch, objlog[prev.index].term
                        )
                    )

                # Mark that there is a problem and continue
                success[obj] = False
                state[obj] = LogState(objlog.lastApplied, objlog.lastTerm, objlog.lastCommit)
                continue

            # At this point the entries are accepted because of continue statements
            if entries:
                if objlog.lastApplied >= prev.index:
                    # If existing entry conflicts with a new one (same index, different epochs)
                    # Delete the existing entry and all that follow it.
                    # NOTE(review): the consistency check above already
                    # skipped the object on this same mismatch, so this
                    # truncate looks unreachable -- confirm.
                    if objlog[prev.index].term != prev.epoch:
                        objlog.truncate(prev.index)

                if objlog.lastApplied > prev.index:
                    # Better look into what's happening here!
                    raise TagRPCException(
                        "{} is possibly receiving duplicate append entries".format(self)
                    )

                # Append any new entries not already in the log.
                for entry in entries:
                    # Add the entry/epoch to the log
                    objlog.append(*entry)

                    # Update the versions to compute visibilities
                    entry[0].update(self)

                # Log the last write from the append entries
                self.sim.logger.debug(
                    "appending {} entries to {} log on {} (term {}, commit {})".format(
                        len(entries), obj, self, objlog.lastTerm, objlog.commitIndex
                    )
                )

            # Update the commit index and save the state of the object.
            if prev.commit > objlog.commitIndex:
                objlog.commitIndex = min(prev.commit, objlog.lastApplied)

            success[obj] = True
            state[obj] = LogState(objlog.lastApplied, objlog.lastTerm, objlog.lastCommit)

        # Return the response back to the owner
        reason = Reason.OK if all(success.values()) else Reason.LOG
        return self.send(
            msg.source, AEResponse(self.epoch, success, state, reason)
        )

    def on_ae_response_rpc(self, msg):
        """
        Handles acknowledgment of append entries messages.

        As OWNER: updates the next/match indices for the responding node,
        advances the commit index of each object where an election over the
        match indices passes, and resends append entries to the responder
        when one of its objects failed. As TAGGING: restarts the tag request
        if the response is from a later epoch. Ignored when READY. Raises
        TagRPCException for an unknown failure reason or unknown state.
        """
        rpc = msg.value
        retry = False

        if self.state == State.OWNER:

            # Update state of followers in the tag group
            for obj, success in rpc.success.items():
                if success:
                    self.nextIndex[msg.source][obj] = rpc.tag[obj].index + 1
                    self.matchIndex[msg.source][obj] = rpc.tag[obj].index

                else:
                    # If the epoch is not the same, update accordingly.
                    if rpc.epoch > self.epoch:
                        self.epoch = rpc.epoch

                    # If the failure was because of the epoch, simply retry.
                    if rpc.reason == Reason.EPOCH:
                        retry = True

                    # Otherwise decrement the next index and to retry
                    elif rpc.reason == Reason.LOG:
                        self.nextIndex[msg.source][obj] -= 1
                        retry = True

                    else:
                        raise TagRPCException(
                            "Unknown append entries failure reason: {}".format(rpc.reason)
                        )

            # Determine if we can commit the entry
            for obj, state in rpc.tag.items():
                log = self.log[obj]
                # Walk back from the newest entry to just above the commit
                # index, stopping at the first index that can be committed.
                for n in xrange(log.lastApplied, log.commitIndex, -1):
                    commit = Election(self.matchIndex.keys())
                    for node, objs in self.matchIndex.items():
                        match = objs[obj]
                        commit.vote(node, match >= n)

                    if commit.has_passed() and log[n].term == self.epoch:
                        # Commit all versions from the last log to now.
                        for idx in xrange(log.commitIndex, n+1):
                            if not log[idx].version: continue
                            log[idx].version.update(self, commit=True)

                        # Set the commit index and break
                        log.commitIndex = n
                        break

            # If retry, send append entries back to the source.
            if retry: self.send_append_entries(msg.source)


        elif self.state == State.TAGGING:
            # Determine if we need to retry the tagging again.
            if rpc.epoch > self.epoch:
                # Retry the tag request
                self.epoch = rpc.epoch
                self.send_tag_request(self.tag)

                self.sim.logger.info(
                    "{} retrying tag request for {}".format(self, self.tag)
                )

                return

        elif self.state == State.READY:
            # Ignore AE messages if we're not an owner anymore.
            return

        else:
            raise TagRPCException(
                "Response in unknown state: '{}'".format(self.state)
            )

    def on_remote_access(self, msg):
        """
        Handles remote writes to and from the replicas.

        Replies with an unsuccessful AccessResponse if this replica does not
        own the accessed object; otherwise performs the read or write locally
        and replies with a successful AccessResponse.
        """
        access = msg.value.access

        # Ensure that we own the object
        if not self.owns(access.name):
            return self.send(
                msg.source, AccessResponse(self.epoch, False, access)
            )

        # If we do own the object, then respond:
        method = {
            'read': self.read,
            'write': self.write,
        }[access.type]

        # Call the remote method with the access.
        method(access)

        return self.send(
            msg.source, AccessResponse(self.epoch, True, access)
        )

    def on_access_response_rpc(self, msg):
        """
        Handles responses to remote accesses: a successful response
        completes the access it carries.
        """
        rpc = msg.value
        if rpc.success:
            rpc.access.complete()
Example #22
0
class RaftReplica(ConsensusReplica):

    def __init__(self, simulation, **kwargs):
        """
        Creates a Raft replica that starts out as a follower in term 0 with
        an empty write log and an empty local version cache.

        Recognized keyword arguments (all optional, falling back to the
        module level defaults): read_policy, aggregate_writes,
        election_timeout and heartbeat_interval. All keyword arguments are
        also passed through to the superclass initializer.
        """
        # The base replica must be set up before any Raft state.
        super(RaftReplica, self).__init__(simulation, **kwargs)

        # Raft specific state (state is assigned first, as it was originally)
        self.state = State.FOLLOWER
        self.currentTerm = 0
        self.votedFor = None
        self.log = MultiObjectWriteLog()
        self.cache = {}

        # Read and write policies
        policy = kwargs.get('read_policy', READ_POLICY)
        self.read_policy = ReadPolicy.get(policy)
        self.aggregate_writes = kwargs.get('aggregate_writes', AGGREGATE_WRITES)

        # Election timer (followers/candidates) and heartbeat timer (leader)
        election_timeout = kwargs.get('election_timeout', ELECTION_TIMEOUT)
        heartbeat_interval = kwargs.get('heartbeat_interval', HEARTBEAT_INTERVAL)

        self.timeout = ElectionTimer.fromReplica(self, election_timeout)
        self.heartbeat = Timer(
            self.env, heartbeat_interval, self.on_heartbeat_timeout
        )

        # Per-follower bookkeeping, only populated while this node is leader
        self.nextIndex = None
        self.matchIndex = None

    ######################################################################
    ## Core Methods (Replica API)
    ######################################################################

    def recv(self, event):
        """
        Performs the term check that applies to every incoming message and
        then hands the event to the superclass for dispatch: when the RPC
        carries a term newer than ours, this replica becomes a follower and
        adopts that term.
        """
        rpc = event.value.value

        # A newer term anywhere in the quorum means we are behind.
        behind = rpc.term > self.currentTerm
        if behind:
            self.state = State.FOLLOWER
            self.currentTerm = rpc.term

        # Let the base class record the message and call the RPC handler.
        return super(RaftReplica, self).recv(event)

    def read(self, name, **kwargs):
        """
        Performs a local read of the named object. Which version is visible
        (latest committed, or latest including uncommitted and cached
        writes) is decided by the read policy configured on the replica.

        Returns the completed access, or the result of dropping the access
        as empty when no version of the object is available.
        """
        # The superclass builds the read access event.
        access = super(RaftReplica, self).read(name, **kwargs)

        # Only accesses that originated here count as an attempt.
        if access.is_local_to(self):
            access.attempts += 1

        # Select the version according to the replica's read policy.
        version = self.read_via_policy(access.name)

        if version is not None:
            # Local reads finish immediately; then record them on this replica.
            access.update(version, completed=True)
            access.log(self)
            return access

        # Nothing has been written for this name yet.
        return access.drop(empty=True)

    def write(self, name, **kwargs):
        """
        Writes to the named object; may be called on any replica. The
        superclass creates the access, which tells us whether the write
        originated here (local) or was forwarded to us (remote).

        Local write:
        - a new version is derived via the write policy and set on the access
        - leader: appended to the log and completed right away
        - otherwise: the version is cached (so this node can read its own
          write) and the access is forwarded to the leader

        Remote write:
        - the access must already carry a version, else AccessError is raised
        - leader: appended to the log but not completed (the origin does that)
        - otherwise: an info message is logged, the version is cached and
          the access is forwarded to the leader

        On the leader the version is then updated for visibility tracking
        and, unless writes are aggregated, AppendEntries is sent immediately
        and the heartbeat timer is stopped.
        """
        access = super(RaftReplica, self).write(name, **kwargs)
        is_local = access.is_local_to(self)

        if is_local:
            # Count the attempt and derive the next version by policy.
            access.attempts += 1
            version = self.write_via_policy(access.name)
            access.update(version)

            # Log the access from this particular replica.
            access.log(self)
        else:
            # Log the access from this particular replica.
            access.log(self)

            # A forwarded write has to bring its version along.
            if access.version is None:
                raise AccessError(
                    "Attempting a remote write on {} without a version!".format(self)
                )
            version = access.version

        if not self.state == State.LEADER:
            if not is_local:
                # A remote write arrived at a node that is not the leader.
                self.sim.logger.info(
                    "remote write on follower node: {}".format(self)
                )

            # Keep a cached copy and pass the write on to the leader.
            self.cache[access.name] = version
            return self.send_remote_write(access)

        # Leader: append to the log; only local writes complete here.
        self.append_via_policy(access, complete=bool(is_local))

        # Everything below is leader-only work.
        assert self.state == State.LEADER

        # Update the version to track visibility latency
        forte = bool(settings.simulation.forte_on_append)
        version.update(self, forte=forte)

        # Without aggregation, replicate now and skip the pending heartbeat.
        if not self.aggregate_writes:
            self.send_append_entries()
            self.heartbeat.stop()

        return access

    def run(self):
        """
        Main process loop of the replica: followers and candidates wait on
        the election timer, the leader waits on the heartbeat timer. Any
        other state raises a SimulationException.
        """
        while True:
            # Followers and candidates wait for an election timeout.
            if self.state in {State.FOLLOWER, State.CANDIDATE}:
                yield self.timeout.start()
                continue

            # The leader paces itself with the heartbeat timer.
            if self.state == State.LEADER:
                yield self.heartbeat.start()
                continue

            raise SimulationException(
                "Unknown Raft State: {!r} on {}".format(self.state, self)
            )

    ######################################################################
    ## Helper Methods
    ######################################################################

    def send_append_entries(self, target=None):
        """
        Sends AppendEntries to every follower tracked in nextIndex, or only
        to target when one is given. Does nothing unless this node is the
        leader, and silently sends nothing if target is not a known follower.
        """
        # Only the leader replicates its log.
        if not self.state == State.LEADER:
            return

        for node, nidx in self.nextIndex.iteritems():
            # Skip everyone but the requested target, if there is one.
            if target is not None and node != target:
                continue

            # Entries from the follower's next index on; empty = heartbeat.
            if self.log.lastApplied >= nidx:
                entries = self.log[nidx:]
            else:
                entries = []

            # The entry just before the ones being sent, for the log check.
            prevLogIndex = nidx - 1
            prevLogTerm = self.log[prevLogIndex].term

            message = AppendEntries(
                self.currentTerm, self.id, prevLogIndex,
                prevLogTerm, entries, self.log.commitIndex
            )
            self.send(node, message)

    def send_remote_write(self, access):
        """
        Forwards the write access to the current leader as a RemoteWrite.
        If no leader can be found the access is dropped instead and the
        result of the drop is returned.
        """
        leader = self.get_leader_node()

        if leader:
            # Hand the write over to the leader.
            self.send(leader, RemoteWrite(self.currentTerm, access))
            return access

        # Without a leader there is nobody to accept the write.
        self.sim.logger.info(
            "no leader: dropped write at {}".format(self)
        )
        return access.drop()

    def get_leader_node(self):
        """
        Returns the node in the quorum whose state is LEADER, or None when
        there is no such node. Raises a SimulationException if more than one
        node claims leadership.
        """
        leaders = []
        for node in self.quorum():
            if node.state == State.LEADER:
                leaders.append(node)

        # More than one leader should never happen.
        if len(leaders) > 1:
            raise SimulationException("MutipleLeaders?!")

        return leaders[0] if leaders else None

    def read_via_policy(self, name):
        """
        Returns the version of the named object that is visible under the
        replica's read policy, or None if there is no such version:

            - COMMIT: the latest committed version in the log (the cache is
              ignored)
            - LATEST: the latest version in the log (committed or not), or
              the locally cached version when that is newer or when the log
              has no version for the name at all

        Raises a SimulationException for any other read policy.
        """

        # If the policy is read committed, return the latest committed version
        if self.read_policy == ReadPolicy.COMMIT:
            return self.log.get_latest_commit(name)

        # If the policy is latest, read the latest and compare to cache.
        if self.read_policy == ReadPolicy.LATEST:
            # Get the latest version from the log (committed or not)
            version = self.log.get_latest_version(name)

            if name in self.cache:
                cached = self.cache[name]

                # A cached write wins when the log has nothing for this name
                # yet (otherwise a node could not read its own write to a
                # new object) or when it is newer than the log's version.
                if version is None or cached > version:
                    return cached

            # Return the latest version from the log (possibly None)
            return version

        # If we've reached this point, we don't know what to do!
        raise SimulationException("Unknown read policy!")

    def write_via_policy(self, name):
        """
        Creates and returns the version for a new write to the named object.
        The parent is whatever read_via_policy returns for the name; if that
        is None, a first version is created from the object's namespace.
        """
        # The parent version is whatever a read would see under the policy.
        parent = self.read_via_policy(name)

        if parent is not None:
            # Derive the next version from the parent.
            return parent.nextv(self)

        # No parent: this is the first write to the object.
        return namespace(name)(self)

    def append_via_policy(self, access, complete=False):
        """
        Gatekeeper for appending to the leader's log. The default policy
        admits every write: the access's version is appended with the
        current term, the access is completed if requested, and True is
        returned to signal that the append happened.

        NOTE: Leader only; followers receive entries through AppendEntries.
        A RaftRPCException is raised when called on a non-leader. Subclasses
        may override this to enforce stricter admission policies.
        """
        if self.state != State.LEADER:
            raise RaftRPCException(
                "Append via policies called on a follower replica!"
            )

        # Default policy: admit the write unconditionally.
        version = access.version
        self.log.append(version, self.currentTerm)

        # The caller decides whether the access finishes here.
        if complete:
            access.complete()

        return True

    ######################################################################
    ## Event Handlers
    ######################################################################

    def on_state_change(self):
        """
        Resets the state-dependent bookkeeping to match the replica's
        current state:

            - FOLLOWER or CANDIDATE: clears the vote and the leader indices
            - LEADER: starts tracking every other quorum member, with
              nextIndex just past the last applied entry and matchIndex 0
            - READY: nothing to do (set during the call to super)

        Raises a SimulationException for any other state.
        """
        if self.state in (State.FOLLOWER, State.CANDIDATE):
            # Candidates are handled here together with followers; a
            # separate candidate branch would never be reached.
            self.votedFor    = None
            self.nextIndex   = None
            self.matchIndex  = None
        elif self.state == State.LEADER:
            # Every quorum member except this node is a follower to track.
            followers = [node for node in self.quorum() if node != self]
            self.nextIndex   = {
                node: self.log.lastApplied + 1 for node in followers
            }
            self.matchIndex  = {node: 0 for node in followers}
        elif self.state == State.READY:
            # This happens on the call to super, just ignore for now.
            pass
        else:
            raise SimulationException(
                "Unknown Raft State: {!r} set on {}".format(self.state, self)
            )

    def on_heartbeat_timeout(self):
        """
        Heartbeat timer callback: the leader sends AppendEntries (a
        heartbeat or any pending writes) to its followers. Non-leaders
        ignore it.
        """
        if self.state == State.LEADER:
            # Send heartbeat or aggregated writes
            self.send_append_entries()

    def on_election_timeout(self):
        """
        Election timer callback: the replica becomes a candidate for the
        next term, votes for itself and asks every other member of the
        quorum for its vote.
        """
        # Become a candidate in a new term.
        self.state = State.CANDIDATE
        self.currentTerm += 1

        # Open the election over the whole quorum and vote for ourselves.
        member_ids = [node.id for node in self.quorum()]
        self.votes = Election(member_ids)
        self.votes.vote(self.id)
        self.votedFor = self.id

        # One request is shared by all recipients.
        request = RequestVote(
            self.currentTerm, self.id, self.log.lastApplied, self.log.lastTerm
        )

        for peer in self.quorum():
            # Everyone but ourselves gets the request.
            if not peer == self:
                self.send(peer, request)

        # Log the newly formed candidacy
        self.sim.logger.info(
            "{} is now a leader candidate".format(self)
        )

    def on_request_vote_rpc(self, msg):
        """
        Handles a RequestVote RPC. The vote is granted only if the
        candidate's term is not behind ours, we have not voted for a
        different candidate, and the log's as_up_to_date check passes for
        the candidate's last log term and index. Granting a vote also stops
        the election timeout. A VoteResponse is sent back either way.
        """
        rpc = msg.value

        # Conditions are checked in order and stop at the first failure.
        granted = (
            rpc.term >= self.currentTerm
            and (self.votedFor is None or self.votedFor == rpc.candidateId)
            and self.log.as_up_to_date(rpc.lastLogTerm, rpc.lastLogIndex)
        )

        if not granted:
            return self.send(
                msg.source, VoteResponse(self.currentTerm, False)
            )

        self.sim.logger.info(
            "{} voting for {}".format(self, rpc.candidateId)
        )

        # Record the vote and stop our own election timeout.
        self.timeout.stop()
        self.votedFor = rpc.candidateId

        return self.send(
            msg.source, VoteResponse(self.currentTerm, True)
        )

    def on_vote_response_rpc(self, msg):
        """
        Handles the response to a RequestVote RPC. A candidate records the
        vote and, once the election has passed, becomes leader and
        immediately sends AppendEntries to announce it. Followers and
        leaders ignore vote responses; any other state raises a
        RaftRPCException.
        """
        rpc = msg.value

        if not self.state == State.CANDIDATE:
            # Late responses after the election was decided are ignored.
            if self.state in (State.FOLLOWER, State.LEADER):
                return

            raise RaftRPCException(
                "Vote response in unknown state: '{}'".format(self.state)
            )

        # Record this vote in the running election.
        self.votes.vote(msg.source.id, rpc.voteGranted)
        if not self.votes.has_passed():
            return

        # The election has passed: take over as leader.
        self.state = State.LEADER
        self.timeout.stop()

        # Announce the leadership change to the followers.
        self.send_append_entries()

        self.sim.logger.info(
            "{} has become raft leader".format(self)
        )

    def on_append_entries_rpc(self, msg):
        """
        Callback for the AppendEntries RPC call.

        Stops the election timeout and then answers the sender with an
        AEResponse carrying the current term, a success flag and this
        replica's lastApplied and lastCommit log values:

            - failure if the RPC term is behind the current term
            - failure if the log has no entry at prevLogIndex, or the entry
              there has a different term than prevLogTerm
            - success without appending if entries were sent but the log
              already extends past prevLogIndex (treated as a duplicate)
            - success otherwise, after appending any entries and advancing
              the commit index to min(leaderCommit, lastApplied) when the
              leader's commit index is ahead of ours
        """
        rpc = msg.value

        # Stop the election timeout
        # (this happens for every AppendEntries, even ones rejected below)
        self.timeout.stop()

        # Reply false if term < current term
        if rpc.term < self.currentTerm:
            self.sim.logger.info("{} doesn't accept write on term {}".format(self, self.currentTerm))
            return self.send(
                msg.source, AEResponse(self.currentTerm, False, self.log.lastApplied, self.log.lastCommit)
            )

        # Reply false if log doesn't contain an entry at prevLogIndex whose
        # term matches previous log term.
        # Log entries are indexed as pairs here: [0] is the version and [1]
        # is the term (see the append and update calls below).
        if self.log.lastApplied < rpc.prevLogIndex or self.log[rpc.prevLogIndex][1] != rpc.prevLogTerm:
            # The nested check only selects which reason gets logged.
            if self.log.lastApplied < rpc.prevLogIndex:

                self.sim.logger.info(
                    "{} doesn't accept write on index {} where last applied is {}".format(
                        self, rpc.prevLogIndex, self.log.lastApplied
                    )
                )
            else:
                self.sim.logger.info(
                    "{} doesn't accept write for term mismatch {} vs {}".format(
                        self, rpc.prevLogTerm, self.log[rpc.prevLogIndex][1]
                    )
                )

            return self.send(
                msg.source, AEResponse(self.currentTerm, False, self.log.lastApplied, self.log.lastCommit)
            )

        # At this point AppendEntries RPC is accepted
        if rpc.entries:
            # NOTE(review): this conflict check looks unreachable -- the same
            # term mismatch at prevLogIndex already caused a failure reply
            # above, so the truncate below should never run. Confirm whether
            # the intent was to compare the entries *after* prevLogIndex.
            if self.log.lastApplied >= rpc.prevLogIndex:
                # If existing entry conflicts with new one (same index, different terms)
                # Delete the existing entry and all that follow it.
                if self.log[rpc.prevLogIndex][1] != rpc.prevLogTerm:
                    self.log.truncate(rpc.prevLogIndex)

            # The log already has entries past prevLogIndex: nothing is
            # appended and the sender is told the call succeeded.
            if self.log.lastApplied > rpc.prevLogIndex:
                # Otherwise this could be a message that is sent again
                # raise RaftRPCException(
                #     "{} is possibly receiving a duplicate append entries!".format(self)
                # )
                self.sim.logger.warn(
                    "{} is possibly receiving a duplicate append entries!".format(self)
                )
                return self.send(msg.source, AEResponse(self.currentTerm, True, self.log.lastApplied, self.log.lastCommit))


            # Append any new entries not already in the log.
            for entry in rpc.entries:
                # Add the entry/term to the log
                self.log.append(*entry)
                self.sim.logger.debug(
                    "appending {} to {} on {}".format(entry[0], entry[1], self)
                )

                # Update the versions to compute visibilities
                entry[0].update(self)

            # Log the last write from the append entries.
            self.sim.logger.debug(
                "{} writes {} at idx {} (term {}, commit {})".format(
                self, self.log.lastVersion, self.log.lastApplied, self.log.lastTerm, self.log.commitIndex
            ))

        # If leaderCommit > commitIndex, update commit Index
        # (never past the last entry this replica actually holds)
        if rpc.leaderCommit > self.log.commitIndex:
            self.log.commitIndex = min(rpc.leaderCommit, self.log.lastApplied)

        # Return success response.
        return self.send(msg.source, AEResponse(self.currentTerm, True, self.log.lastApplied, self.log.lastCommit))

    def on_ae_response_rpc(self, msg):
        """
        Handles the response to an AppendEntries RPC.

        - Follower: the response is ignored.
        - Candidate: steps down to follower if the response term is at
          least the current term.
        - Leader: on success the sender's nextIndex/matchIndex are advanced;
          on failure nextIndex is decremented (never below 1) and
          AppendEntries is retried for the sender. Either way the leader
          then looks for the highest uncommitted index from the current term
          that passes an Election over matchIndex and commits up to it.

        Any other state raises a RaftRPCException.
        """
        rpc = msg.value

        # Followers have nothing to do with AE responses.
        if self.state == State.FOLLOWER:
            return

        if self.state == State.CANDIDATE:
            # Decide whether or not to step down.
            if rpc.term >= self.currentTerm:
                self.state = State.FOLLOWER

                # Log the failed election
                self.sim.logger.info(
                    "{} has stepped down as candidate".format(self)
                )
            return

        if not self.state == State.LEADER:
            raise RaftRPCException(
                "Append entries response in unknown state: '{}'".format(self.state)
            )

        follower = msg.source

        if rpc.success:
            # The follower has caught up to its reported last log index.
            self.nextIndex[follower] = rpc.lastLogIndex + 1
            self.matchIndex[follower] = rpc.lastLogIndex
        else:
            # Back off by one entry (never below index 1) and try again.
            previous = self.nextIndex[follower] - 1
            self.nextIndex[follower] = max(previous, 1)
            self.send_append_entries(follower)

        # Walk back from the newest entry to the first commitable one.
        for n in xrange(self.log.lastApplied, self.log.commitIndex, -1):
            # Followers vote yes if they already hold entry n.
            commit = Election(self.matchIndex.keys())
            for node, matched in self.matchIndex.iteritems():
                commit.vote(node, matched >= n)

            # Only entries from the current term can be committed this way.
            if not (commit.has_passed() and self.log[n][1] == self.currentTerm):
                continue

            # Mark every version from the old commit index up to n committed.
            for idx in xrange(self.log.commitIndex, n+1):
                version = self.log[idx][0]
                if version is None:
                    continue
                forte = bool(settings.simulation.forte_on_commit)
                version.update(self, commit=True, forte=forte)

            # Set the commit index and stop searching.
            self.log.commitIndex = n
            break

    def on_remote_write_rpc(self, message):
        """
        Handles a RemoteWrite RPC: the payload found on the RPC's version
        attribute is passed to write() and the sender is told, through a
        WriteResponse, whether the access survived (was not dropped).
        """
        rpc = message.value

        # The RPC's version attribute is used as the access for the write.
        access = rpc.version
        self.write(access)

        # A dropped access means the write failed.
        response = WriteResponse(
            self.currentTerm, not access.is_dropped(), access
        )
        self.send(message.source, response)

    def on_write_response_rpc(self, message):
        """
        Completes the write if the remote write was successful.

        The access to complete is taken from the response itself;
        unsuccessful responses are ignored.
        """
        rpc = message.value
        # Only a successful remote write completes the access.
        if rpc.success:
            rpc.access.complete()