def __init__(self, provider: ViewChangerDataProvider, timer: TimerService):
    """
    Prepare the message queues, the instance change storage and the
    periodic checks.

    :param provider: data provider; its `node_status_db` is handed to the
        instance change storage
    :param timer: timer service the repeating checks are attached to
    """
    self.provider = provider
    self._timer = timer

    # Plain bookkeeping, all starting from a clean state:
    # no previous view is remembered
    self.previous_view_no = None
    # no resend action for instanceChange messages is scheduled
    self.instance_change_action = None
    # no instance change rounds have been counted
    self.instance_change_rounds = 0
    # no view change start time has been recorded
    self.start_view_change_ts = 0

    # Incoming and outgoing message queues
    self.inBox, self.outBox = deque(), deque()

    # Only InstanceChange messages are routed by this variant
    self.inBoxRouter = Router(
        (InstanceChange, self.process_instance_change_msg),
    )

    # Storage of instance change votes
    self.instance_changes = InstanceChangeProvider(
        self.config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
        node_status_db=self.provider.node_status_db,
    )

    # A positive ForceViewChangeFreq turns on periodic forced view changes
    forced_period = self.config.ForceViewChangeFreq
    if forced_period > 0:
        RepeatingTimer(self._timer, forced_period, self.on_master_degradation)

    # A positive STATE_FRESHNESS_UPDATE_INTERVAL turns on the periodic
    # freshness check
    freshness_period = self.config.STATE_FRESHNESS_UPDATE_INTERVAL
    if freshness_period > 0:
        RepeatingTimer(self._timer, freshness_period, self.check_freshness)
def __init__(self, provider: ViewChangerDataProvider, timer: TimerService):
    """
    Prepare the message queues, view change bookkeeping, instance change
    storage and the periodic checks.

    :param provider: data provider; its `node_status_db` is handed to the
        instance change storage
    :param timer: timer service the repeating checks are attached to
    """
    self.provider = provider
    self._timer = timer

    # No pre view change strategy is attached at construction time
    self.pre_vc_strategy = None
    self._view_no = 0  # type: int

    # Neither a view change nor its preparation is running yet
    self._view_change_in_progress = False
    self.pre_view_change_in_progress = False

    # Nothing is remembered about a previous view yet
    self.previous_view_no = None
    self.previous_master_primary = None

    # Tracks if other nodes are indicating that this node is in lower view
    # than others. Keeps a map of view no to senders
    # TODO: Consider if sufficient ViewChangeDone for 2 different (and
    # higher views) are received, should one view change be interrupted in
    # between.
    self._next_view_indications = {}

    # Incoming and outgoing message queues
    self.inBox, self.outBox = deque(), deque()

    # Message type -> handler routes for the incoming queue
    routes = (
        (InstanceChange, self.process_instance_change_msg),
        (ViewChangeDone, self.process_vchd_msg),
        (FutureViewChangeDone, self.process_future_view_vchd_msg),
    )
    self.inBoxRouter = Router(*routes)

    # Storage of instance change votes
    self.instance_changes = InstanceChangeProvider(
        self.config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
        node_status_db=self.provider.node_status_db,
    )

    self.set_defaults()
    self.initInsChngThrottling()

    # Action for _schedule instanceChange messages
    self.instance_change_action = None
    # Count of instance change rounds
    self.instance_change_rounds = 0
    # Time for view_change_starting
    self.start_view_change_ts = 0

    # Last successful viewNo.
    # A view change may fail to complete in time; this remembers the view
    # number of the last view change that did complete.
    self.last_completed_view_no = 0

    # A positive ForceViewChangeFreq turns on periodic forced view changes
    forced_period = self.config.ForceViewChangeFreq
    if forced_period > 0:
        RepeatingTimer(self._timer, forced_period, self.on_master_degradation)

    # A positive STATE_FRESHNESS_UPDATE_INTERVAL turns on the periodic
    # freshness check
    freshness_period = self.config.STATE_FRESHNESS_UPDATE_INTERVAL
    if freshness_period > 0:
        RepeatingTimer(self._timer, freshness_period, self.check_freshness)
def __init__(self,
             data: ConsensusSharedData,
             timer: TimerService,
             bus: InternalBus,
             network: ExternalBus,
             db_manager: DatabaseManager,
             stasher: StashingRouter,
             is_master_degraded: Callable[[], bool],
             metrics: MetricsCollector = NullMetricsCollector()):
    """
    Store the collaborators, create the instance change storage and
    subscribe the message handlers.

    :param data: shared consensus data
    :param timer: timer service; its `get_current_time` is used as the
        time provider of the instance change storage
    :param bus: internal bus delivering VoteForViewChange and
        NewViewAccepted
    :param network: external bus
    :param db_manager: database manager; the store registered under
        NODE_STATUS_DB_LABEL backs the instance change storage
    :param stasher: stashing router delivering InstanceChange
    :param is_master_degraded: callable returning a bool
    :param metrics: metrics collector
    """
    self._data = data
    self._timer = timer
    self._bus = bus
    self._network = network
    self._stasher = stasher
    self._is_master_degraded = is_master_degraded
    self.metrics = metrics

    self._config = getConfig()

    # Storage of instance change votes
    outdated_interval = self._config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL
    status_db = db_manager.get_store(NODE_STATUS_DB_LABEL)
    self._instance_changes = InstanceChangeProvider(
        outdated_ic_interval=outdated_interval,
        node_status_db=status_db,
        time_provider=timer.get_current_time,
    )

    # (source, message type, handler) triples, subscribed in this order
    self._subscription = Subscription()
    subscriptions = (
        (bus, VoteForViewChange, self.process_vote_for_view_change),
        (bus, NewViewAccepted, self.process_new_view_accepted),
        (stasher, InstanceChange, self.process_instance_change),
    )
    for source, message_type, handler in subscriptions:
        self._subscription.subscribe(source, message_type, handler)
def test_update_instance_changes_in_db(instance_change_provider, tconf, node_status_db, time_provider):
    """
    A vote added to the provider is visible in its cache and in the DB,
    and a provider created afterwards over `node_status_db` sees it too.
    """
    provider = instance_change_provider
    sender = "Node1"
    target_view = 1
    vote = InstanceChange(target_view, Suspicions.PRIMARY_DEGRADED.code)

    # Before the vote: nothing is known about the view
    assert not provider.has_view(target_view)
    assert not provider.has_inst_chng_from(target_view, sender)
    assert not _is_view_in_db(target_view, provider)

    provider.add_vote(vote, sender)

    # After the vote: both the cache and the DB know about it
    assert provider.has_view(target_view)
    assert provider.has_inst_chng_from(target_view, sender)
    assert _is_view_in_db(target_view, provider)

    # Close and reopen the DB
    provider._node_status_db.close()
    assert provider._node_status_db.closed
    provider._node_status_db.open()

    # A freshly created provider must see the stored vote
    reloaded_provider = InstanceChangeProvider(
        tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
        node_status_db,
        time_provider,
    )
    assert reloaded_provider.has_view(target_view)
    assert reloaded_provider.has_inst_chng_from(target_view, sender)
def test_fail_update_instance_changes_from_db(instance_change_provider, tconf, node_status_db, time_provider, logsearch):
    """
    Malformed records returned by the DB iterator are not loaded into a new
    provider, and the expected message is logged for each bad format.
    """
    outdated_interval_attr = "OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL"

    # --- view without votes ---
    def view_without_votes(include_value=True):
        return {
            "3": node_status_db_serializer.serialize(None)
        }.items()

    node_status_db.iterator = view_without_votes
    provider = InstanceChangeProvider(getattr(tconf, outdated_interval_attr),
                                      node_status_db,
                                      time_provider)
    assert not provider.has_view(3)

    # --- Vote with incorrect timestamp format ---
    def vote_with_bad_timestamp(include_value=True):
        return {
            "3": node_status_db_serializer.serialize({"voter": ["a", 10.4]})
        }.items()

    node_status_db.iterator = vote_with_bad_timestamp
    logs, _ = logsearch(msgs=[
        "InstanceChangeProvider: timestamp in Vote .* : .* - .* must "
        "be of float or int type"
    ])
    InstanceChangeProvider(getattr(tconf, outdated_interval_attr),
                           node_status_db,
                           time_provider)
    assert logs

    # --- Vote with incorrect reason format ---
    def vote_with_bad_reason(include_value=True):
        return {
            "3": node_status_db_serializer.serialize({"voter": [5, 10.4]})
        }.items()

    node_status_db.iterator = vote_with_bad_reason
    logs, _ = logsearch(msgs=[
        "InstanceChangeProvider: reason in Vote .* : .* - .* must "
        "be of int type"
    ])
    InstanceChangeProvider(getattr(tconf, outdated_interval_attr),
                           node_status_db,
                           time_provider)
    assert logs

    # --- incorrect view_no format ---
    def record_with_bad_view_no(include_value=True):
        return {
            "a": node_status_db_serializer.serialize({"voter": [5, 25]})
        }.items()

    node_status_db.iterator = record_with_bad_view_no
    logs, _ = logsearch(
        msgs=["InstanceChangeProvider: view_no='.*' "
              "must be of int type"])
    InstanceChangeProvider(getattr(tconf, outdated_interval_attr),
                           node_status_db,
                           time_provider)
    assert logs
class ViewChanger():
    """
    Collects InstanceChange votes, starts a view change once
    `_canViewChange` allows it, tracks ViewChangeDone messages and asks the
    provider to select primaries when `_start_selection` finds nothing
    blocking it.

    All node-level data and actions are reached through `self.provider`.
    """

    def __init__(self, provider: ViewChangerDataProvider, timer: TimerService):
        """
        :param provider: data provider; also supplies the name, config and
            quorums exposed by the properties below
        :param timer: timer service used for the repeating checks and for
            scheduling instance change resends
        """
        self.provider = provider
        self._timer = timer
        self.pre_vc_strategy = None

        self._view_no = 0  # type: int

        self.inBox = deque()
        self.outBox = deque()
        self.inBoxRouter = Router(
            (InstanceChange, self.process_instance_change_msg),
            (ViewChangeDone, self.process_vchd_msg),
            (FutureViewChangeDone, self.process_future_view_vchd_msg)
        )

        self.instance_changes = InstanceChangeProvider(self.config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
                                                       node_status_db=self.provider.node_status_db)

        # Tracks if other nodes are indicating that this node is in lower view
        # than others. Keeps a map of view no to senders
        # TODO: Consider if sufficient ViewChangeDone for 2 different (and
        # higher views) are received, should one view change be interrupted in
        # between.
        self._next_view_indications = {}

        self._view_change_in_progress = False
        self.pre_view_change_in_progress = False

        self.previous_view_no = None
        self.previous_master_primary = None

        self.set_defaults()

        # Action for _schedule instanceChange messages
        self.instance_change_action = None

        # Count of instance change rounds
        self.instance_change_rounds = 0

        # Time for view_change_starting
        self.start_view_change_ts = 0

        # Last successful viewNo.
        # In some cases view_change process can be uncompleted in time.
        # In that case we want to know, which viewNo was successful (last completed view_change)
        # NOTE(review): nothing in this class assigns it after construction;
        # presumably it is updated from outside - confirm.
        self.last_completed_view_no = 0

        # Force periodic view change if enabled in config
        force_view_change_freq = self.config.ForceViewChangeFreq
        if force_view_change_freq > 0:
            RepeatingTimer(self._timer, force_view_change_freq, self.on_master_degradation)

        # Start periodic freshness check
        state_freshness_update_interval = self.config.STATE_FRESHNESS_UPDATE_INTERVAL
        if state_freshness_update_interval > 0:
            RepeatingTimer(self._timer, state_freshness_update_interval, self.check_freshness)

    def __repr__(self):
        return "{}".format(self.name)

    # PROPERTIES

    @property
    def view_no(self):
        """Current view number."""
        return self._view_no

    @view_no.setter
    def view_no(self, value):
        # Besides storing the value, the change is logged and reported to
        # the provider.
        logger.info("{} setting view no to {}".format(self.name, value))
        self._view_no = value
        self.provider.view_setting_handler(value)

    @property
    def name(self) -> str:
        """Name as reported by the provider."""
        return self.provider.name()

    @property
    def config(self) -> object:
        """Config as reported by the provider."""
        return self.provider.config()

    @property
    def quorums(self) -> Quorums:
        """Quorums as reported by the provider."""
        return self.provider.quorums()

    @property
    def view_change_in_progress(self) -> bool:
        """Whether a view change is currently marked as in progress."""
        return self._view_change_in_progress

    @view_change_in_progress.setter
    def view_change_in_progress(self, value: bool):
        # The new status is also forwarded to the provider.
        self._view_change_in_progress = value
        self.provider.set_view_change_status(value)

    @property
    def quorum(self) -> int:
        """Value of the `view_change_done` quorum."""
        return self.quorums.view_change_done.value

    @property
    def _hasViewChangeQuorum(self):
        # This method should just be present for master instance.
        """
        Checks whether n-f nodes completed view change and whether one
        of them is the next primary
        """
        # NOTE(review): the body only compares the number of tracked
        # ViewChangeDone senders with `self.quorum`; the next-primary check
        # is done separately in `has_view_change_from_primary`.
        num_of_ready_nodes = len(self._view_change_done)
        diff = self.quorum - num_of_ready_nodes
        if diff > 0:
            logger.info('{} needs {} ViewChangeDone messages'.format(self, diff))
            return False

        logger.info("{} got view change quorum ({} >= {})".
                    format(self.name, num_of_ready_nodes, self.quorum))
        return True

    @property
    def has_view_change_from_primary(self) -> bool:
        """
        Whether a ViewChangeDone from the next primary has been tracked.

        Once found, the positive result is remembered in
        `_has_view_change_from_primary` until `set_defaults` resets it.
        """
        if not self._has_view_change_from_primary:
            next_primary_name = self.provider.next_primary_name()

            if next_primary_name not in self._view_change_done:
                logger.info("{} has not received ViewChangeDone from the next "
                            "primary {} (view_no: {}, totalNodes: {})".
                            format(self.name, next_primary_name, self.view_no, self.quorums.n))
            else:
                logger.info('{} received ViewChangeDone from primary {}'.format(self, next_primary_name))
                self._has_view_change_from_primary = True

        return self._has_view_change_from_primary

    @property
    def has_acceptable_view_change_quorum(self):
        """
        Whether there is a ViewChangeDone quorum that includes the next
        primary. A positive result is remembered until `set_defaults`
        resets it.
        """
        if not self._has_acceptable_view_change_quorum:
            self._has_acceptable_view_change_quorum = \
                (self._hasViewChangeQuorum and self.has_view_change_from_primary)
        return self._has_acceptable_view_change_quorum

    @property
    def is_behind_for_view(self) -> bool:
        # Checks if the node is currently behind the accepted state for this
        # view, only makes sense to call when the node has an acceptable
        # view change quorum
        # NOTE: the unpacking below fails if no accepted ViewChangeDone
        # message is available (the getter returns None in that case).
        _, accepted_ledger_summary = self.get_sufficient_same_view_change_done_messages()
        for (ledgerId, own_ledger_size, _), (_, accepted_ledger_size, _) in \
                zip(self.provider.ledger_summary(), accepted_ledger_summary):
            if own_ledger_size < accepted_ledger_size:
                logger.info("{} ledger {} sizes are differ: own {} accepted {}".
                            format(self, ledgerId, own_ledger_size, accepted_ledger_size))
                return True
        return False

    # __ PROPERTIES __

    # EXTERNAL EVENTS

    def on_master_degradation(self):
        """Propose a view change with the default suspicion."""
        self.propose_view_change()

    def check_freshness(self):
        """
        Propose a view change with STATE_SIGS_ARE_NOT_UPDATED unless the
        state is fresh enough.
        """
        if self.is_state_fresh_enough():
            logger.debug("{} not sending instance change because found state to be fresh enough".format(self))
            return
        self.propose_view_change(Suspicions.STATE_SIGS_ARE_NOT_UPDATED)

    def send_instance_change_if_needed(self, proposed_view_no, reason):
        """
        Scheduled action: resend the instance change and reschedule itself
        while the view change cannot start, the proposed view is still ahead
        and the primary is disconnected; otherwise stop the resending.

        :param proposed_view_no: view number the instance change proposes
        :param reason: suspicion passed on to `sendInstanceChange`
        """
        can, whyNot = self._canViewChange(proposed_view_no)
        # if scheduled action will be awakened after view change completed,
        # then this action must be stopped also.
        if not can and self.view_no < proposed_view_no and self.provider.is_primary_disconnected():
            # Resend the same instance change message if we have not
            # achieved InstanceChange quorum
            logger.info("Resend instance change message to all recipients")
            self.sendInstanceChange(proposed_view_no, reason)
            self._timer.schedule(self.config.INSTANCE_CHANGE_TIMEOUT,
                                 self.instance_change_action)
            # The counter is logged before it is incremented
            logger.info("Count of rounds without quorum of "
                        "instance change messages: {}".format(self.instance_change_rounds))
            self.instance_change_rounds += 1
        else:
            # ViewChange procedure was started, therefore stop scheduling
            # resending instanceChange messages
            logger.info("Stop scheduling instance change resending")
            self._timer.cancel(self.instance_change_action)
            self.instance_change_action = None
            self.instance_change_rounds = 0

    def on_primary_loss(self):
        """
        Propose a view change with PRIMARY_DISCONNECTED and schedule
        `send_instance_change_if_needed` for the proposed view after
        INSTANCE_CHANGE_TIMEOUT, replacing any previously scheduled action.
        """
        view_no = self.propose_view_change(Suspicions.PRIMARY_DISCONNECTED)
        if self.instance_change_action:
            # It's an action, scheduled for previous instanceChange round
            logger.info("Stop previous instance change resending schedule")
            self._timer.cancel(self.instance_change_action)
            self.instance_change_rounds = 0
        self.instance_change_action = partial(self.send_instance_change_if_needed,
                                              view_no,
                                              Suspicions.PRIMARY_DISCONNECTED)
        self._timer.schedule(self.config.INSTANCE_CHANGE_TIMEOUT,
                             self.instance_change_action)

    # TODO we have `on_primary_loss`, do we need that one?
    def on_primary_about_to_be_disconnected(self):
        """Propose a view change with PRIMARY_ABOUT_TO_BE_DISCONNECTED."""
        self.propose_view_change(Suspicions.PRIMARY_ABOUT_TO_BE_DISCONNECTED)

    def on_suspicious_primary(self, suspicion: Suspicions):
        """Propose a view change with the given suspicion."""
        self.propose_view_change(suspicion)

    def on_view_change_not_completed_in_time(self):
        """
        Propose a view change with INSTANCE_CHANGE_TIMEOUT and ask the
        provider to schedule an instance change resend.
        """
        self.propose_view_change(Suspicions.INSTANCE_CHANGE_TIMEOUT)
        self.provider.schedule_resend_inst_chng()

    def on_replicas_count_changed(self):
        """Propose a view change with REPLICAS_COUNT_CHANGED."""
        self.propose_view_change(Suspicions.REPLICAS_COUNT_CHANGED)

    def on_catchup_complete(self):
        """
        Send own ViewChangeDone and try to start primary selection.

        :raises LogicError: if the provider reports the node as not synced,
            or if `provider.is_primary()` is not None
        """
        if not self.provider.is_node_synced():
            raise LogicError('on_catchup_complete can be called only after catchup completed')
        # Parsed as `not (x is None)`, i.e. raises when is_primary() is
        # anything other than None
        if not self.provider.is_primary() is None:
            raise LogicError('Primary on master replica cannot be elected yet')
        self._send_view_change_done_message()
        self._start_selection()

    def process_future_view_vchd_msg(self, future_vcd_msg: FutureViewChangeDone, frm):
        """
        Record a ViewChangeDone for a view above the current one and start
        a view change to it once enough senders indicated that view.

        :param future_vcd_msg: wrapper whose `vcd_msg` is the ViewChangeDone
        :param frm: name of the sender
        """
        # if we already started a view change then do not decide on a new one
        if self.view_change_in_progress:
            return

        view_no = future_vcd_msg.vcd_msg.viewNo
        # ToDo maybe we should compare with last_completed_view_no instead of viewNo.
        if view_no <= self.view_no:
            # it means we already processed this future View Change Done
            return

        if view_no not in self._next_view_indications:
            self._next_view_indications[view_no] = {}
        self._next_view_indications[view_no][frm] = future_vcd_msg.vcd_msg
        self._do_view_change_by_future_vcd(view_no)

    # __ EXTERNAL EVENTS __

    def process_instance_change_msg(self, instChg: InstanceChange, frm: str) -> None:
        """
        Validate and process an instance change request.

        The message is discarded when the sender is not connected, when
        `viewNo` is not an int, or when `viewNo` is not above the current
        view. Otherwise the vote is recorded, and this node sends its own
        instance change if it has not sent one for that view and the
        provider reports the master as degraded.

        :param instChg: the instance change request
        :param frm: the name of the node that sent this `msg`
        """
        if frm not in self.provider.connected_nodes():
            self.provider.discard(
                instChg,
                "received instance change request: {} from {} "
                "which is not in connected list: {}".format(
                    instChg, frm, self.provider.connected_nodes()), logger.info)
            return

        logger.info("{} received instance change request: {} from {}".format(self, instChg, frm))

        # TODO: add sender to blacklist?
        if not isinstance(instChg.viewNo, int):
            self.provider.discard(
                instChg, "{}field view_no has incorrect type: {}".format(
                    VIEW_CHANGE_PREFIX, type(instChg.viewNo)))
        elif instChg.viewNo <= self.view_no:
            self.provider.discard(
                instChg,
                "Received instance change request with view no {} "
                "which is not more than its view no {}".format(
                    instChg.viewNo, self.view_no), logger.info)
        else:
            # Record instance changes for views but send instance change
            # only when found master to be degraded. if quorum of view changes
            # found then change view even if master not degraded
            self._on_verified_instance_change_msg(instChg, frm)

            if self.instance_changes.has_inst_chng_from(instChg.viewNo, self.name):
                logger.info("{} received instance change message {} but has already "
                            "sent an instance change message".format(self, instChg))
            elif not self.provider.is_master_degraded():
                logger.info("{} received instance change message {} but did not "
                            "find the master to be slow".format(self, instChg))
            else:
                logger.display("{}{} found master degraded after receiving instance change"
                               " message from {}".format(VIEW_CHANGE_PREFIX, self, frm))
                self.sendInstanceChange(instChg.viewNo)

    def process_vchd_msg(self, msg: ViewChangeDone, sender: str) -> bool:
        """
        Processes ViewChangeDone messages. Once n-f messages have been
        received, decides on a primary for specific replica.

        :param msg: ViewChangeDone message
        :param sender: the name of the node from which this message was sent
        :return: False when the message is discarded
        """
        # NOTE(review): the path that reaches `_start_selection()` has no
        # explicit return, so it yields None despite the `-> bool`
        # annotation.
        logger.info("{}'s primary selector started processing of ViewChangeDone msg from {} : {}".
                    format(self.name, sender, msg))
        view_no = msg.viewNo
        if self.view_no != view_no:
            self.provider.discard(msg,
                                  '{} got Primary from {} for view no {} '
                                  'whereas current view no is {}'.
                                  format(self, sender, view_no, self.view_no),
                                  logMethod=logger.info)
            return False

        new_primary_name = msg.name
        if new_primary_name == self.previous_master_primary:
            self.provider.discard(msg,
                                  '{} got Primary from {} for {} who was primary of '
                                  'master in previous view too'.
                                  format(self, sender, new_primary_name),
                                  logMethod=logger.info)
            return False

        # Since a node can send ViewChangeDone more than one time
        self._on_verified_view_change_done_msg(msg, sender)
        # TODO why do we check that after the message tracking
        if self.provider.has_primary():
            self.provider.discard(msg,
                                  "it already decided primary which is {}".
                                  format(self.provider.current_primary_name()),
                                  logger.info)
            return False

        self._start_selection()

    def send(self, msg):
        """
        Send a message to the node.

        The message is appended to `outBox`.

        :param msg: the message to send
        """
        logger.debug("{}'s view_changer sending {}".format(self.name, msg))
        self.outBox.append(msg)

    async def serviceQueues(self, limit=None) -> int:
        """
        Service at most `limit` messages from the inBox.

        :param limit: the maximum number of messages to service
        :return: the number of messages successfully processed; 0 while the
            node mode is not "done syncing"
        """
        # do not start any view changes until catch-up is finished!
        if not Mode.is_done_syncing(self.provider.node_mode()):
            return 0
        return await self.inBoxRouter.handleAll(self.inBox, limit)

    def sendInstanceChange(self, view_no: int,
                           suspicion=Suspicions.PRIMARY_DEGRADED):
        """
        Broadcast an instance change request to all the remaining nodes

        :param view_no: the view number when the instance change is requested
        :param suspicion: suspicion whose `reason` is logged and whose
            `code` is put into the message
        """

        # If not found any sent instance change messages in last
        # `ViewChangeWindowSize` seconds or the last sent instance change
        # message was sent long enough ago then instance change message can be
        # sent otherwise no.
        # NOTE(review): no such throttling check is visible in this body -
        # the message is always created and sent.

        logger.info(
            "{}{} sending an instance change with view_no {}"
            " since {}".format(
                VIEW_CHANGE_PREFIX,
                self,
                view_no,
                suspicion.reason))
        logger.info("{}{} metrics for monitor: {}"
                    .format(MONITORING_PREFIX, self,
                            self.provider.pretty_metrics()))
        msg = self._create_instance_change_msg(view_no, suspicion.code)
        self.send(msg)
        # record instance change vote for self and try to change the view
        # if quorum is reached
        self._on_verified_instance_change_msg(msg, self.name)

    def _create_instance_change_msg(self, view_no, suspicion_code):
        """Build an InstanceChange for the given view and suspicion code."""
        return InstanceChange(view_no, suspicion_code)

    def _on_verified_instance_change_msg(self, msg, frm):
        """
        Record the vote if `frm` has not voted for this view yet and, when
        the view is above the current one, try to start the view change.
        """
        view_no = msg.viewNo

        if not self.instance_changes.has_inst_chng_from(view_no, frm):
            self.instance_changes.add_vote(msg, frm)
            if view_no > self.view_no:
                self._start_view_change_by_instance_change(view_no)

    def _start_view_change_by_instance_change(self, view_no):
        """
        Start a view change to `view_no` if `_canViewChange` allows it.

        :return: whether the view change was started
        """
        # TODO: Need to handle skewed distributions which can arise due to
        # malicious nodes sending messages early on
        can, whyNot = self._canViewChange(view_no)
        if can:
            logger.display("{}{} initiating a view change to {} from {}".
                           format(VIEW_CHANGE_PREFIX, self, view_no, self.view_no))
            self.start_view_change(view_no)
        else:
            logger.info(whyNot)
        return can

    def _quorum_is_reached(self, count):
        """Whether `count` reaches the `view_change_done` quorum."""
        return self.quorums.view_change_done.is_reached(count)

    def _do_view_change_by_future_vcd(self, view_no) -> bool:
        """
        Start a view change to `view_no` when the number of senders that
        indicated this view reaches the quorum.

        :return: True if the view change was started, False otherwise
        """
        ind_count = len(self._next_view_indications[view_no])
        if self._quorum_is_reached(ind_count):
            logger.display('{}{} starting view change for {} after {} view change '
                           'indications from other nodes'.format(VIEW_CHANGE_PREFIX, self, view_no, ind_count))
            self.start_view_change(view_no)
            return True
        return False

    def _canViewChange(self, proposedViewNo: int) -> (bool, str):
        """
        Return whether there's quorum for view change for the proposed view
        number and its view is less than or equal to the proposed view

        :return: `(True, None)` when the view change is allowed, otherwise
            `(False, reason)`
        """
        msg = None
        quorum = self.quorums.view_change.value
        if not self.instance_changes.has_quorum(proposedViewNo, quorum):
            msg = '{} has no quorum for view {}'.format(self, proposedViewNo)
        elif not proposedViewNo > self.view_no:
            msg = '{} is in higher view more than {}'.format(
                self, proposedViewNo)

        return not bool(msg), msg

    def start_view_change(self, proposed_view_no: int, continue_vc=False):
        """
        Trigger the view change process.

        When a pre view change strategy is set and `continue_vc` is False,
        only the strategy's preparation is started and the method returns.

        :param proposed_view_no: the new view number after view change.
        :param continue_vc: if True, proceed with the view change itself
            even when a pre view change strategy is set
        """
        # TODO: consider moving this to pool manager
        # TODO: view change is a special case, which can have different
        # implementations - we need to make this logic pluggable

        if self.pre_vc_strategy and (not continue_vc):
            self.pre_view_change_in_progress = True
            self.pre_vc_strategy.prepare_view_change(proposed_view_no)
            return
        elif self.pre_vc_strategy:
            self.pre_vc_strategy.on_strategy_complete()

        self.previous_view_no = self.view_no
        self.view_no = proposed_view_no
        self.pre_view_change_in_progress = False
        self.view_change_in_progress = True
        self.previous_master_primary = self.provider.current_primary_name()
        # Forget ViewChangeDone tracking of the previous view
        self.set_defaults()
        self._process_vcd_for_future_view()

        self.provider.notify_view_change_start()
        self.provider.start_catchup()

    def _process_vcd_for_future_view(self):
        """
        Move ViewChangeDone messages stored for the (now current) view into
        the regular tracking and drop stored indications for views up to
        and including the current one.
        """
        # make sure that all received VCD messages for future view
        # (including the current view) are stored, as they will be needed for a quorum
        # to finish the View Change and start selection.
        # This is especially critical for Propagate Primary mode (on receiving CURRENT_STATE by a new node).
        if self.view_no in self._next_view_indications:
            for frm, vcd in self._next_view_indications[self.view_no].items():
                # we call _on_verified_view_change_done_msg, not process_vchd_msg,
                # since we may be in propagate primary mode where some of validation inside process_vchd_msg
                # is not correct (for example, checking that the new primary differs from the current one)
                self._on_verified_view_change_done_msg(vcd, frm)

        # remove all for previous views
        # (keys are copied into a tuple because entries are deleted while looping)
        for view_no in tuple(self._next_view_indications.keys()):
            if view_no <= self.view_no:
                del self._next_view_indications[view_no]

    def _on_verified_view_change_done_msg(self, msg, frm):
        """
        Remember `(primary name, ledger info)` from `msg` for sender `frm`,
        replacing whatever was stored for that sender before.
        """
        new_primary_name = msg.name
        ledger_summary = msg.ledgerInfo

        # TODO what is the case when node sends several different
        # view change done messages
        data = (new_primary_name, ledger_summary)
        self._view_change_done[frm] = data

    def _start_selection(self):
        """
        Ask the provider to select primaries if nothing blocks it.

        Selection is skipped (with a log message) when the node is not
        synced, there is no acceptable ViewChangeDone quorum, there are not
        enough identical ViewChangeDone messages, or the primary cannot be
        verified. If the node is behind the accepted ledger state, a
        catch-up is started instead.
        """

        error = None

        if not self.provider.is_node_synced():
            error = "mode is {}".format(self.provider.node_mode())
        elif not self.has_acceptable_view_change_quorum:
            error = "has no view change quorum or no message from next primary"
        else:
            rv = self.get_sufficient_same_view_change_done_messages()
            if rv is None:
                error = "there are not sufficient same ViewChangeDone messages"
            elif not self._verify_primary(*rv):
                error = "failed to verify primary"

        if error is not None:
            logger.info('{} cannot start primary selection because {}'.format(self, error))
            return

        if self.is_behind_for_view:
            logger.info('{} is synced and has an acceptable view change quorum '
                        'but is behind the accepted state'.format(self))
            self.provider.start_catchup()
            return

        self.provider.select_primaries()

        if self.view_change_in_progress:
            self.view_change_in_progress = False
            self.provider.notify_view_change_complete()
            # when we had INSTANCE_CHANGE message, they added into instanceChanges
            # by msg.view_no. When view change was occurred and view_no is changed,
            # then we should delete all INSTANCE_CHANGE messages with current (already changed)
            # view_no (which used in corresponded INSTANCE_CHANGE messages)
            # Therefore we delete all INSTANCE_CHANGE messages from previous and current view number
            self.instance_changes.remove_view(self.view_no)
            self.previous_view_no = None
            self.previous_master_primary = None

    def set_defaults(self):
        """Reset the ViewChangeDone tracking and the flags derived from it."""
        # Tracks view change done message
        self._view_change_done = {}  # replica name -> data

        # Set when an appropriate view change quorum is found which has
        # sufficient same ViewChangeDone messages
        self._primary_verified = False

        self._has_view_change_from_primary = False

        self._has_acceptable_view_change_quorum = False

        self._accepted_view_change_done_message = None

    def get_sufficient_same_view_change_done_messages(self) -> Optional[Tuple]:
        """
        Return the accepted `(primary name, ledger info)` pair, or None.

        When nothing is accepted yet and some ViewChangeDone messages are
        tracked, the most common pair is accepted if its vote count reaches
        `self.quorum`.
        """
        # Returns whether has a quorum of ViewChangeDone messages that are same
        # TODO: Does not look like optimal implementation.
        if self._accepted_view_change_done_message is None and \
                self._view_change_done:
            votes = self._view_change_done.values()
            # Ledger info is converted to nested tuples before counting
            votes = [(nm, tuple(tuple(i) for i in info)) for nm, info in votes]
            (new_primary, ledger_info), vote_count = mostCommonElement(votes)
            if vote_count >= self.quorum:
                logger.info('{} found acceptable primary {} and ledger info {}'.
                            format(self, new_primary, ledger_info))
                self._accepted_view_change_done_message = (new_primary, ledger_info)
            else:
                logger.info('{} does not have acceptable primary, only {} votes for {}'.
                            format(self, vote_count, (new_primary, ledger_info)))

        return self._accepted_view_change_done_message

    def _verify_primary(self, new_primary, ledger_info):
        """
        This method is called when sufficient number of ViewChangeDone
        received and makes steps to switch to the new primary

        :param new_primary: primary name declared by the accepted messages
        :param ledger_info: accepted ledger info (not checked here)
        :return: True if `new_primary` equals the provider's next primary
            name, False otherwise
        """
        expected_primary = self.provider.next_primary_name()
        if new_primary != expected_primary:
            logger.error("{}{} expected next primary to be {}, but majority "
                         "declared {} instead for view {}"
                         .format(PRIMARY_SELECTION_PREFIX, self.name,
                                 expected_primary, new_primary, self.view_no))
            return False

        self._primary_verified = True
        return True
        # TODO: check if ledger status is expected

    def _send_view_change_done_message(self):
        """
        Sends ViewChangeDone message to other protocol participants

        The message is also recorded as this node's own ViewChangeDone.
        """
        new_primary_name = self.provider.next_primary_name()
        ledger_summary = self.provider.ledger_summary()
        message = ViewChangeDone(self.view_no,
                                 new_primary_name,
                                 ledger_summary)

        logger.info("{} is sending ViewChangeDone msg to all : {}".format(self, message))

        self.send(message)
        self._on_verified_view_change_done_msg(message, self.name)

    # overridden method of PrimaryDecider
    def get_msgs_for_lagged_nodes(self) -> List[ViewChangeDone]:
        # Should not return a list, only done for compatibility with interface
        """
        Returns the last accepted `ViewChangeDone` message.
        If no view change has happened returns ViewChangeDone
        with view no 0 to a newly joined node

        The returned message uses `last_completed_view_no` as its view
        number. The list is empty when there is neither an accepted message
        nor an own ViewChangeDone.
        """
        # TODO: Consider a case where more than one node joins immediately,
        # then one of the node might not have an accepted
        # ViewChangeDone message
        messages = []
        accepted = self._accepted_view_change_done_message
        if accepted:
            messages.append(ViewChangeDone(self.last_completed_view_no, *accepted))
        elif self.name in self._view_change_done:
            messages.append(ViewChangeDone(self.last_completed_view_no,
                                           *self._view_change_done[self.name]))
        else:
            logger.info('{} has no ViewChangeDone message to send for view {}'.
                        format(self, self.view_no))
        return messages

    def propose_view_change(self, suspicion=Suspicions.PRIMARY_DEGRADED):
        """
        Send an instance change proposing the next view.

        The current view number itself is proposed only when the suspicion
        is STATE_SIGS_ARE_NOT_UPDATED and a view change is already in
        progress; in every other case the current view number plus one is
        proposed.

        :param suspicion: the reason for the proposal
        :return: the proposed view number
        """
        proposed_view_no = self.view_no
        # TODO: For some reason not incrementing view_no in most cases leads to lots of failing/flaky tests
        # if suspicion == Suspicions.INSTANCE_CHANGE_TIMEOUT or not self.view_change_in_progress:
        if suspicion != Suspicions.STATE_SIGS_ARE_NOT_UPDATED or not self.view_change_in_progress:
            proposed_view_no += 1
        self.sendInstanceChange(proposed_view_no, suspicion)
        return proposed_view_no

    def is_state_fresh_enough(self):
        """
        Whether the freshness check should NOT trigger an instance change.

        True when the provider's state freshness is below
        ACCEPTABLE_FRESHNESS_INTERVALS_COUNT * STATE_FRESHNESS_UPDATE_INTERVAL,
        or when no view change is in progress and the node mode is not
        "done syncing".
        """
        threshold = self.config.ACCEPTABLE_FRESHNESS_INTERVALS_COUNT * self.config.STATE_FRESHNESS_UPDATE_INTERVAL
        return self.provider.state_freshness() < threshold or (not self.view_change_in_progress and
                                                               not Mode.is_done_syncing(self.provider.node_mode()))
class ViewChanger:
    """
    Records InstanceChange votes and asks the provider to start a view change
    once the votes recorded for a higher view reach the view change quorum.
    """

    def __init__(self, provider: ViewChangerDataProvider, timer: TimerService):
        self.provider = provider
        self._timer = timer

        # Incoming messages are dispatched by `serviceQueues`; outgoing ones
        # are queued by `send`.
        self.inBox = deque()
        self.outBox = deque()
        self.inBoxRouter = Router(
            (InstanceChange, self.process_instance_change_msg))

        self.instance_changes = InstanceChangeProvider(
            self.config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
            node_status_db=self.provider.node_status_db)

        self.previous_view_no = None

        # Callable currently scheduled to resend InstanceChange messages
        self.instance_change_action = None

        # Number of InstanceChange resend rounds performed so far
        self.instance_change_rounds = 0

        # Timestamp of the view change start
        self.start_view_change_ts = 0

        # A positive ForceViewChangeFreq enables periodic forced view changes
        forced_period = self.config.ForceViewChangeFreq
        if forced_period > 0:
            RepeatingTimer(self._timer, forced_period,
                           self.on_master_degradation)

        # A positive STATE_FRESHNESS_UPDATE_INTERVAL enables the periodic
        # freshness check
        freshness_period = self.config.STATE_FRESHNESS_UPDATE_INTERVAL
        if freshness_period > 0:
            RepeatingTimer(self._timer, freshness_period,
                           self.check_freshness)

    def __repr__(self):
        return "{}".format(self.name)

    # PROPERTIES

    @property
    def view_no(self):
        """Current view number as reported by the provider."""
        return self.provider.view_no()

    @property
    def name(self) -> str:
        """Name reported by the provider."""
        return self.provider.name()

    @property
    def config(self) -> object:
        """Configuration object reported by the provider."""
        return self.provider.config()

    @property
    def quorums(self) -> Quorums:
        """Quorums reported by the provider."""
        return self.provider.quorums()

    @property
    def view_change_in_progress(self) -> bool:
        """Whether the provider reports a view change in progress."""
        return self.provider.view_change_in_progress()

    @property
    def quorum(self) -> int:
        """Value of the `view_change_done` quorum."""
        return self.quorums.view_change_done.value

    # __ PROPERTIES __

    # EXTERNAL EVENTS

    def on_master_degradation(self):
        """Propose a view change with the default suspicion."""
        self.propose_view_change()

    def check_freshness(self):
        """Propose a view change unless the state is fresh enough."""
        if not self.is_state_fresh_enough():
            self.propose_view_change(Suspicions.STATE_SIGS_ARE_NOT_UPDATED)
            return
        logger.debug("{} not sending instance change because found state to be fresh enough".format(self))

    def send_instance_change_if_needed(self, proposed_view_no, reason):
        """
        Scheduled action: resend the InstanceChange and reschedule itself
        while there is no quorum, the proposed view is still ahead and the
        primary is disconnected; otherwise cancel the resending schedule.

        :param proposed_view_no: the view number being proposed
        :param reason: the suspicion passed on to `sendInstanceChange`
        """
        can, _ = self._canViewChange(proposed_view_no)
        keep_resending = (not can and
                          self.view_no < proposed_view_no and
                          self.provider.is_primary_disconnected())

        if not keep_resending:
            # The conditions for resending no longer hold, so stop the
            # resending schedule and reset the round counter
            logger.info("Stop scheduling instance change resending")
            self._timer.cancel(self.instance_change_action)
            self.instance_change_action = None
            self.instance_change_rounds = 0
            return

        # No InstanceChange quorum yet: send the same message again and
        # schedule one more round
        logger.info("Resend instance change message to all recipients")
        self.sendInstanceChange(proposed_view_no, reason)
        self._timer.schedule(self.config.NEW_VIEW_TIMEOUT,
                             self.instance_change_action)
        logger.info("Count of rounds without quorum of "
                    "instance change messages: {}".format(self.instance_change_rounds))
        self.instance_change_rounds += 1

    def on_primary_loss(self):
        """
        Propose a view change because the primary is disconnected and
        schedule a resend of the InstanceChange after NEW_VIEW_TIMEOUT.
        """
        proposed = self.propose_view_change(Suspicions.PRIMARY_DISCONNECTED)

        if self.instance_change_action:
            # An action from a previous InstanceChange round is still set
            logger.info("Stop previous instance change resending schedule")
            self._timer.cancel(self.instance_change_action)
            self.instance_change_rounds = 0

        resend = partial(self.send_instance_change_if_needed,
                         proposed,
                         Suspicions.PRIMARY_DISCONNECTED)
        self.instance_change_action = resend
        self._timer.schedule(self.config.NEW_VIEW_TIMEOUT, resend)

    # TODO we have `on_primary_loss`, do we need that one?
    def on_primary_about_to_be_disconnected(self):
        """Propose a view change with PRIMARY_ABOUT_TO_BE_DISCONNECTED."""
        self.propose_view_change(Suspicions.PRIMARY_ABOUT_TO_BE_DISCONNECTED)

    def on_suspicious_primary(self, suspicion: Suspicions):
        """Propose a view change with the given suspicion."""
        self.propose_view_change(suspicion)

    def on_node_count_changed(self):
        """Propose a view change with NODE_COUNT_CHANGED."""
        self.propose_view_change(Suspicions.NODE_COUNT_CHANGED)

    # __ EXTERNAL EVENTS __

    def process_instance_change_msg(self, instChg: InstanceChange, frm: str) -> None:
        """
        Validate an incoming InstanceChange, record the vote and, if the
        master is found degraded, send this node's own InstanceChange.

        :param instChg: the instance change request
        :param frm: the name of the node that sent this `msg`
        """
        if frm not in self.provider.connected_nodes():
            self.provider.discard(
                instChg,
                "received instance change request: {} from {} "
                "which is not in connected list: {}".format(
                    instChg, frm, self.provider.connected_nodes()),
                logger.info)
            return

        logger.info("{} received instance change request: {} from {}".format(self, instChg, frm))

        # TODO: add sender to blacklist?
        if not isinstance(instChg.viewNo, int):
            self.provider.discard(
                instChg,
                "{}field view_no has incorrect type: {}".format(
                    VIEW_CHANGE_PREFIX, type(instChg.viewNo)))
            return

        if instChg.viewNo <= self.view_no:
            self.provider.discard(
                instChg,
                "Received instance change request with view no {} "
                "which is not more than its view no {}".format(
                    instChg.viewNo, self.view_no),
                logger.info)
            return

        # The vote is always recorded (which may already start a view change
        # when a quorum is reached); our own InstanceChange is sent only if
        # we have not voted yet and the master is found degraded.
        self._on_verified_instance_change_msg(instChg, frm)

        if self.instance_changes.has_inst_chng_from(instChg.viewNo, self.name):
            logger.info("{} received instance change message {} but has already "
                        "sent an instance change message".format(self, instChg))
            return

        if not self.provider.is_master_degraded():
            logger.info("{} received instance change message {} but did not "
                        "find the master to be slow".format(self, instChg))
            return

        logger.display("{}{} found master degraded after receiving instance change"
                       " message from {}".format(VIEW_CHANGE_PREFIX, self, frm))
        self.sendInstanceChange(instChg.viewNo)

    def send(self, msg):
        """
        Queue a message in the outBox.

        :param msg: the message to send
        """
        logger.debug("{}'s view_changer sending {}".format(self.name, msg))
        self.outBox.append(msg)

    async def serviceQueues(self, limit=None) -> int:
        """
        Route up to `limit` messages from the inBox.

        :param limit: the maximum number of messages to service
        :return: the result of the router's `handleAll`, or 0 when the node
            mode is not yet "done syncing"
        """
        # no view change handling until catch-up is finished
        if not Mode.is_done_syncing(self.provider.node_mode()):
            return 0
        return await self.inBoxRouter.handleAll(self.inBox, limit)

    def sendInstanceChange(self, view_no: int,
                           suspicion=Suspicions.PRIMARY_DEGRADED):
        """
        Queue an InstanceChange for `view_no` and record this node's own vote.

        :param view_no: the view number when the instance change is requested
        :param suspicion: the suspicion whose reason is logged and whose code
            is put into the message
        """
        logger.info(
            "{}{} sending an instance change with view_no {}"
            " since {}".format(
                VIEW_CHANGE_PREFIX,
                self,
                view_no,
                suspicion.reason))
        logger.info("{}{} metrics for monitor: {}"
                    .format(MONITORING_PREFIX, self,
                            self.provider.pretty_metrics()))

        own_vote = self._create_instance_change_msg(view_no, suspicion.code)
        self.send(own_vote)
        # count our own vote as well; this may start the view change if the
        # quorum is reached
        self._on_verified_instance_change_msg(own_vote, self.name)

    def _create_instance_change_msg(self, view_no, suspicion_code):
        """Build an InstanceChange for the view number and suspicion code."""
        return InstanceChange(view_no, suspicion_code)

    def _on_verified_instance_change_msg(self, msg, frm):
        """
        Record the vote from `frm` if not recorded yet and, when the voted
        view is ahead of the current one, try to start the view change.
        """
        voted_view = msg.viewNo

        if not self.instance_changes.has_inst_chng_from(voted_view, frm):
            self.instance_changes.add_vote(msg, frm)

        if voted_view > self.view_no:
            self._start_view_change_by_instance_change(voted_view)

    def _start_view_change_by_instance_change(self, view_no):
        """
        Start a view change to `view_no` if `_canViewChange` allows it.

        :return: whether the view change was started
        """
        # TODO: Need to handle skewed distributions which can arise due to
        # malicious nodes sending messages early on
        can, why_not = self._canViewChange(view_no)
        if not can:
            logger.info(why_not)
            return can

        logger.display("{}{} initiating a view change to {} from {}".
                       format(VIEW_CHANGE_PREFIX, self, view_no, self.view_no))
        self.start_view_change(view_no)
        return can

    def _canViewChange(self, proposedViewNo: int) -> (bool, str):
        """
        Tell whether there is a quorum of instance changes for the proposed
        view number and that number is greater than the current view.

        :return: `(True, None)` when allowed, otherwise `(False, reason)`
        """
        needed = self.quorums.view_change.value

        if not self.instance_changes.has_quorum(proposedViewNo, needed):
            return False, '{} has no quorum for view {}'.format(self, proposedViewNo)

        if not proposedViewNo > self.view_no:
            return False, '{} is in higher view more than {}'.format(
                self, proposedViewNo)

        return True, None

    def start_view_change(self, proposed_view_no: int, continue_vc=False):
        """
        Remember the current view number and trigger the view change through
        the provider.

        :param proposed_view_no: the new view number after view change.
        :param continue_vc: not used by this method
        """
        self.previous_view_no = self.view_no

        self.provider.notify_view_change_start()
        self.provider.start_view_change(proposed_view_no)

        # TODO: Check whether these still need to be called somewhere after view change:
        #  - self.instance_changes.remove_view(self.view_no)

    def propose_view_change(self, suspicion=Suspicions.PRIMARY_DEGRADED):
        """
        Send an InstanceChange for the next view. The current view number is
        proposed instead when the suspicion is STATE_SIGS_ARE_NOT_UPDATED and
        a view change is already in progress.

        :return: the proposed view number
        """
        target_view = self.view_no

        if suspicion != Suspicions.STATE_SIGS_ARE_NOT_UPDATED or not self.view_change_in_progress:
            target_view += 1

        self.sendInstanceChange(target_view, suspicion)
        return target_view

    def is_state_fresh_enough(self):
        """
        Tell whether the state freshness is below the configured threshold,
        or no view change is in progress while the node mode is not yet
        "done syncing".
        """
        threshold = self.config.ACCEPTABLE_FRESHNESS_INTERVALS_COUNT * self.config.STATE_FRESHNESS_UPDATE_INTERVAL

        below_threshold = self.provider.state_freshness() < threshold
        if below_threshold:
            return below_threshold

        if self.view_change_in_progress:
            return False

        return not Mode.is_done_syncing(self.provider.node_mode())
class ViewChangeTriggerService:
    """
    Records InstanceChange votes and sends NodeNeedViewChange on the internal
    bus once the votes recorded for a higher view reach the view change quorum.
    """

    def __init__(self,
                 data: ConsensusSharedData,
                 timer: TimerService,
                 bus: InternalBus,
                 network: ExternalBus,
                 db_manager: DatabaseManager,
                 stasher: StashingRouter,
                 is_master_degraded: Callable[[], bool],
                 metrics: MetricsCollector = NullMetricsCollector()):
        self._data = data
        self._timer = timer
        self._bus = bus
        self._network = network
        self._stasher = stasher
        self._is_master_degraded = is_master_degraded
        self.metrics = metrics

        self._config = getConfig()

        # Vote storage, backed by the node status store and using the
        # timer's clock as its time provider
        self._instance_changes = InstanceChangeProvider(
            outdated_ic_interval=self._config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
            node_status_db=db_manager.get_store(NODE_STATUS_DB_LABEL),
            time_provider=timer.get_current_time)

        # All handlers go through one Subscription so `cleanup` can drop
        # them together
        self._subscription = Subscription()
        self._subscription.subscribe(bus, VoteForViewChange, self.process_vote_for_view_change)
        self._subscription.subscribe(bus, NewViewAccepted, self.process_new_view_accepted)
        self._subscription.subscribe(stasher, InstanceChange, self.process_instance_change)

    def cleanup(self):
        """Remove every subscription made in the constructor."""
        self._subscription.unsubscribe_all()

    @property
    def name(self):
        """Node name derived from the replica name held in the shared data."""
        return replica_name_to_node_name(self._data.name)

    def __repr__(self):
        return self.name

    def process_vote_for_view_change(self, msg: VoteForViewChange):
        """
        Send an InstanceChange for the next view. The current view number is
        proposed instead when the suspicion is STATE_SIGS_ARE_NOT_UPDATED and
        the node is already waiting for a new view.
        """
        target_view = self._data.view_no

        # TODO: Some time ago it was proposed that view_no should not be
        #  increased during proposal if a view change is already in progress,
        #  unless the suspicion code is "view change is taking too long"
        #  (i.e. increase only when the suspicion is INSTANCE_CHANGE_TIMEOUT
        #  or no view change is in progress). The idea was to improve the
        #  stability of view change triggering, however for some reason that
        #  change led to lots of failing/flaky tests. This still needs to be
        #  investigated.
        if msg.suspicion != Suspicions.STATE_SIGS_ARE_NOT_UPDATED or not self._data.waiting_for_new_view:
            target_view += 1

        self._send_instance_change(target_view, msg.suspicion)

    def process_instance_change(self, msg: InstanceChange, frm: str):
        """
        Validate an incoming InstanceChange, record the vote and, if the
        master is found degraded, send this node's own InstanceChange.

        :param msg: the instance change request
        :param frm: the sender's name; converted to a node name before use
        :return: `(DISCARD, reason)` for a message from a sender that is not
            connected or for a view that is not ahead of the current one,
            `(STASH_CATCH_UP, CATCHING_UP)` while the node is not
            participating, otherwise None
        """
        sender = replica_name_to_node_name(frm)

        # TODO: Do we really need this?
        if sender not in self._network.connecteds:
            return DISCARD, "instance change request: {} from {} which is not in connected list: {}".\
                format(msg, sender, self._network.connecteds)

        if not self._data.is_participating:
            return STASH_CATCH_UP, CATCHING_UP

        logger.info("{} received instance change request: {} from {}".format(self, msg, sender))

        if msg.viewNo <= self._data.view_no:
            return DISCARD, "instance change request with view no {} which is not more than its view no {}".\
                format(msg.viewNo, self._data.view_no)

        # The vote is always recorded (which may already request a view
        # change when a quorum is reached); our own InstanceChange is sent
        # only if we have not voted yet and the master is found degraded.
        self._on_verified_instance_change_msg(msg, sender)

        if self._instance_changes.has_inst_chng_from(msg.viewNo, self.name):
            logger.info("{} received instance change message {} "
                        "but has already sent an instance change message".format(self, msg))
            return

        if not self._is_master_degraded():
            logger.info("{} received instance change message {} "
                        "but did not find the master to be slow".format(self, msg))
            return

        logger.display("{}{} found master degraded after "
                       "receiving instance change message from {}".format(VIEW_CHANGE_PREFIX, self, sender))
        self._send_instance_change(msg.viewNo, Suspicions.PRIMARY_DEGRADED)

    def process_new_view_accepted(self, msg: NewViewAccepted):
        """Drop the votes stored for the current view number."""
        self._instance_changes.remove_view(self._data.view_no)

    def _send_instance_change(self, view_no: int, suspicion: Suspicion):
        """
        Send an InstanceChange for `view_no` over the network and record this
        node's own vote.
        """
        logger.info("{}{} sending an instance change with view_no {} since {}".
                    format(VIEW_CHANGE_PREFIX, self, view_no, suspicion.reason))

        own_vote = InstanceChange(view_no, suspicion.code)
        self._network.send(own_vote)

        # count our own vote as well; this may request the view change if
        # the quorum is reached
        self._on_verified_instance_change_msg(own_vote, self.name)

    def _on_verified_instance_change_msg(self, msg: InstanceChange, frm: str):
        """
        Record the vote from `frm` if not recorded yet and, when the voted
        view is ahead of the current one, try to request the view change.
        """
        voted_view = msg.viewNo

        if not self._instance_changes.has_inst_chng_from(voted_view, frm):
            self._instance_changes.add_vote(msg, frm)

        if voted_view > self._data.view_no:
            self._try_start_view_change_by_instance_change(voted_view)

    def _try_start_view_change_by_instance_change(self, proposed_view_no: int) -> bool:
        """
        Send NodeNeedViewChange on the bus if `_can_view_change` allows it.

        :return: whether the view change was requested
        """
        # TODO: Need to handle skewed distributions which can arise due to
        # malicious nodes sending messages early on
        can, why_not = self._can_view_change(proposed_view_no)
        if not can:
            logger.info(why_not)
            return can

        logger.display("{}{} initiating a view change to {} from {}".
                       format(VIEW_CHANGE_PREFIX, self, proposed_view_no, self._data.view_no))
        self._bus.send(NodeNeedViewChange(view_no=proposed_view_no))
        return can

    def _can_view_change(self, proposed_view_no: int) -> (bool, str):
        """
        Tell whether there is a quorum of instance changes for the proposed
        view number and that number is greater than the current view.

        :return: `(True, '')` when allowed, otherwise `(False, reason)`
        """
        needed = self._data.quorums.view_change.value

        if not self._instance_changes.has_quorum(proposed_view_no, needed):
            allowed, reason = False, '{} has no quorum for view {}'.format(self, proposed_view_no)
        elif not proposed_view_no > self._data.view_no:
            allowed, reason = False, '{} is in higher view more than {}'.format(self, proposed_view_no)
        else:
            allowed, reason = True, ''

        return allowed, reason
def instance_change_provider(tconf, node_status_db, time_provider):
    """
    Build an InstanceChangeProvider from the given config, node status store
    and time provider.

    :param tconf: config object providing OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL
    :param node_status_db: passed as the provider's second positional argument
    :param time_provider: passed as the provider's third positional argument
    :return: the new InstanceChangeProvider
    """
    outdated_check_interval = tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL
    return InstanceChangeProvider(outdated_check_interval,
                                  node_status_db,
                                  time_provider)