Example #1
    def __init__(self, provider: ViewChangerDataProvider, timer: TimerService):
        self.provider = provider
        self._timer = timer

        self.inBox = deque()
        self.outBox = deque()
        self.inBoxRouter = Router(
            (InstanceChange, self.process_instance_change_msg)
        )

        self.instance_changes = InstanceChangeProvider(self.config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
                                                       node_status_db=self.provider.node_status_db)

        self.previous_view_no = None

        # Action scheduled for resending instanceChange messages
        self.instance_change_action = None

        # Count of instance change rounds
        self.instance_change_rounds = 0

        # Timestamp of view change start
        self.start_view_change_ts = 0

        # Force periodic view change if enabled in config
        force_view_change_freq = self.config.ForceViewChangeFreq
        if force_view_change_freq > 0:
            RepeatingTimer(self._timer, force_view_change_freq, self.on_master_degradation)

        # Start periodic freshness check
        state_freshness_update_interval = self.config.STATE_FRESHNESS_UPDATE_INTERVAL
        if state_freshness_update_interval > 0:
            RepeatingTimer(self._timer, state_freshness_update_interval, self.check_freshness)
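
The constructor above registers a single (message type, handler) route on a Router and later drains inBox through it (see serviceQueues in the later examples). Below is a minimal stand-alone sketch of that dispatch pattern, assuming inBox entries are (message, sender) pairs; it is an illustration only, not plenum's actual Router API.

from collections import deque


class TinyRouter:
    # Illustrative stand-in for Router: maps message types to handlers.
    def __init__(self, *routes):
        self._routes = dict(routes)  # (message_type, handler) pairs, as in Router(...)

    async def handleAll(self, queue: deque, limit=None):
        # Drain up to `limit` (message, sender) pairs, dispatching by message type.
        handled = 0
        while queue and (limit is None or handled < limit):
            msg, frm = queue.popleft()
            handler = self._routes.get(type(msg))
            if handler is not None:
                handler(msg, frm)
            handled += 1
        return handled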
Example #2
    def __init__(self, provider: ViewChangerDataProvider, timer: TimerService):
        self.provider = provider
        self._timer = timer
        self.pre_vc_strategy = None

        self._view_no = 0  # type: int

        self.inBox = deque()
        self.outBox = deque()
        self.inBoxRouter = Router(
            (InstanceChange, self.process_instance_change_msg),
            (ViewChangeDone, self.process_vchd_msg),
            (FutureViewChangeDone, self.process_future_view_vchd_msg))

        self.instance_changes = InstanceChangeProvider(
            self.config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
            node_status_db=self.provider.node_status_db)

        # Tracks whether other nodes are indicating that this node is in a lower
        # view than they are. Keeps a map of view no to senders.
        # TODO: Consider whether, if sufficient ViewChangeDone messages for 2
        # different (and higher) views are received, the view change in between
        # should be interrupted.
        self._next_view_indications = {}

        self._view_change_in_progress = False
        self.pre_view_change_in_progress = False

        self.previous_view_no = None
        self.previous_master_primary = None

        self.set_defaults()

        self.initInsChngThrottling()

        # Action scheduled for resending instanceChange messages
        self.instance_change_action = None

        # Count of instance change rounds
        self.instance_change_rounds = 0

        # Timestamp of view change start
        self.start_view_change_ts = 0

        # Last successfully completed viewNo.
        # The view_change process may not complete in time; in that case we want
        # to know which viewNo was the last one to complete successfully.
        self.last_completed_view_no = 0

        # Force periodic view change if enabled in config
        force_view_change_freq = self.config.ForceViewChangeFreq
        if force_view_change_freq > 0:
            RepeatingTimer(self._timer, force_view_change_freq,
                           self.on_master_degradation)

        # Start periodic freshness check
        state_freshness_update_interval = self.config.STATE_FRESHNESS_UPDATE_INTERVAL
        if state_freshness_update_interval > 0:
            RepeatingTimer(self._timer, state_freshness_update_interval,
                           self.check_freshness)
Example #3
    def __init__(self,
                 data: ConsensusSharedData,
                 timer: TimerService,
                 bus: InternalBus,
                 network: ExternalBus,
                 db_manager: DatabaseManager,
                 stasher: StashingRouter,
                 is_master_degraded: Callable[[], bool],
                 metrics: MetricsCollector = NullMetricsCollector()):
        self._data = data
        self._timer = timer
        self._bus = bus
        self._network = network
        self._stasher = stasher
        self._is_master_degraded = is_master_degraded
        self.metrics = metrics

        self._config = getConfig()

        self._instance_changes = \
            InstanceChangeProvider(outdated_ic_interval=self._config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
                                   node_status_db=db_manager.get_store(NODE_STATUS_DB_LABEL),
                                   time_provider=timer.get_current_time)

        self._subscription = Subscription()
        self._subscription.subscribe(bus, VoteForViewChange, self.process_vote_for_view_change)
        self._subscription.subscribe(bus, NewViewAccepted, self.process_new_view_accepted)
        self._subscription.subscribe(stasher, InstanceChange, self.process_instance_change)
Example #4
def test_update_instance_changes_in_db(instance_change_provider, tconf,
                                       node_status_db, time_provider):
    frm = "Node1"
    view_no = 1
    msg = InstanceChange(view_no, Suspicions.PRIMARY_DEGRADED.code)

    assert not instance_change_provider.has_view(view_no)
    assert not instance_change_provider.has_inst_chng_from(view_no, frm)
    assert not _is_view_in_db(view_no, instance_change_provider)

    instance_change_provider.add_vote(msg, frm)
    assert instance_change_provider.has_view(view_no)
    assert instance_change_provider.has_inst_chng_from(view_no, frm)
    assert _is_view_in_db(view_no, instance_change_provider)

    instance_change_provider._node_status_db.close()
    assert instance_change_provider._node_status_db.closed
    instance_change_provider._node_status_db.open()

    new_instance_change_provider = InstanceChangeProvider(
        tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL, node_status_db,
        time_provider)
    assert new_instance_change_provider.has_view(view_no)
    assert new_instance_change_provider.has_inst_chng_from(view_no, frm)
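
The same provider backs the quorum check used by _canViewChange (Examples #6 and #7) and _can_view_change (Example #8). The following is a hedged sketch of such a test, using only the calls shown in this listing plus the instance_change_provider fixture from the last example; it assumes has_quorum requires at least `quorum` distinct voters for the view.

def test_instance_change_quorum_sketch(instance_change_provider):
    # Hypothetical sketch, not a test from the code base: three nodes vote
    # for view 1, then the quorum check used by _canViewChange should pass
    # for quorum=3 (in the node the value comes from quorums.view_change.value).
    view_no = 1
    msg = InstanceChange(view_no, Suspicions.PRIMARY_DEGRADED.code)
    for frm in ("Node1", "Node2", "Node3"):
        instance_change_provider.add_vote(msg, frm)

    assert instance_change_provider.has_quorum(view_no, 3)
    assert not instance_change_provider.has_quorum(view_no, 4)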
Example #5
def test_fail_update_instance_changes_from_db(instance_change_provider, tconf,
                                              node_status_db, time_provider,
                                              logsearch):
    # test updating cache with view without votes
    node_status_db.iterator = lambda include_value=True: {
        "3": node_status_db_serializer.serialize(None)
    }.items()
    provider = InstanceChangeProvider(
        tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL, node_status_db,
        time_provider)
    assert not provider.has_view(3)

    # test updating cache with Vote with incorrect timestamp format
    node_status_db.iterator = lambda include_value=True: {
        "3": node_status_db_serializer.serialize({"voter": ["a", 10.4]})
    }.items()
    logs, _ = logsearch(msgs=[
        "InstanceChangeProvider: timestamp in Vote .* : .* - .* must "
        "be of float or int type"
    ])
    InstanceChangeProvider(tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
                           node_status_db, time_provider)
    assert logs

    # test updating cache with Vote with incorrect reason format
    node_status_db.iterator = lambda include_value=True: {
        "3": node_status_db_serializer.serialize({"voter": [5, 10.4]})
    }.items()
    logs, _ = logsearch(msgs=[
        "InstanceChangeProvider: reason in Vote .* : .* - .* must "
        "be of int type"
    ])
    InstanceChangeProvider(tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
                           node_status_db, time_provider)
    assert logs

    # test updating cache with incorrect view_no format
    node_status_db.iterator = lambda include_value=True: {
        "a": node_status_db_serializer.serialize({"voter": [5, 25]})
    }.items()
    logs, _ = logsearch(
        msgs=["InstanceChangeProvider: view_no='.*' "
              "must be of int type"])
    InstanceChangeProvider(tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
                           node_status_db, time_provider)
    assert logs
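
From the failure cases above one can infer the expected on-disk layout: the key is the view number as a string, and each voter name maps to a [timestamp, suspicion_code] pair, where the timestamp must be a float or int and the suspicion code an int. A hedged sketch of a well-formed entry under that assumption:

# Hypothetical well-formed entry, inferred from the validation errors above.
well_formed_entry = {
    "3": node_status_db_serializer.serialize({"Node1": [10.4, 25]})
}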
Example #6
class ViewChanger():

    def __init__(self, provider: ViewChangerDataProvider, timer: TimerService):
        self.provider = provider
        self._timer = timer
        self.pre_vc_strategy = None

        self._view_no = 0  # type: int

        self.inBox = deque()
        self.outBox = deque()
        self.inBoxRouter = Router(
            (InstanceChange, self.process_instance_change_msg),
            (ViewChangeDone, self.process_vchd_msg),
            (FutureViewChangeDone, self.process_future_view_vchd_msg)
        )

        self.instance_changes = InstanceChangeProvider(self.config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
                                                       node_status_db=self.provider.node_status_db)

        # Tracks whether other nodes are indicating that this node is in a lower
        # view than they are. Keeps a map of view no to senders.
        # TODO: Consider whether, if sufficient ViewChangeDone messages for 2
        # different (and higher) views are received, the view change in between
        # should be interrupted.
        self._next_view_indications = {}

        self._view_change_in_progress = False
        self.pre_view_change_in_progress = False

        self.previous_view_no = None
        self.previous_master_primary = None

        self.set_defaults()

        # Action scheduled for resending instanceChange messages
        self.instance_change_action = None

        # Count of instance change rounds
        self.instance_change_rounds = 0

        # Timestamp of view change start
        self.start_view_change_ts = 0

        # Last successfully completed viewNo.
        # The view_change process may not complete in time; in that case we want
        # to know which viewNo was the last one to complete successfully.
        self.last_completed_view_no = 0

        # Force periodic view change if enabled in config
        force_view_change_freq = self.config.ForceViewChangeFreq
        if force_view_change_freq > 0:
            RepeatingTimer(self._timer, force_view_change_freq, self.on_master_degradation)

        # Start periodic freshness check
        state_freshness_update_interval = self.config.STATE_FRESHNESS_UPDATE_INTERVAL
        if state_freshness_update_interval > 0:
            RepeatingTimer(self._timer, state_freshness_update_interval, self.check_freshness)

    def __repr__(self):
        return "{}".format(self.name)

    # PROPERTIES

    @property
    def view_no(self):
        return self._view_no

    @view_no.setter
    def view_no(self, value):
        logger.info("{} setting view no to {}".format(self.name, value))
        self._view_no = value
        self.provider.view_setting_handler(value)

    @property
    def name(self) -> str:
        return self.provider.name()

    @property
    def config(self) -> object:
        return self.provider.config()

    @property
    def quorums(self) -> Quorums:
        return self.provider.quorums()

    @property
    def view_change_in_progress(self) -> bool:
        return self._view_change_in_progress

    @view_change_in_progress.setter
    def view_change_in_progress(self, value: bool):
        self._view_change_in_progress = value
        self.provider.set_view_change_status(value)

    @property
    def quorum(self) -> int:
        return self.quorums.view_change_done.value

    @property
    def _hasViewChangeQuorum(self):
        # This method should just be present for master instance.
        """
        Checks whether n-f nodes completed view change and whether one
        of them is the next primary
        """
        num_of_ready_nodes = len(self._view_change_done)
        diff = self.quorum - num_of_ready_nodes
        if diff > 0:
            logger.info('{} needs {} ViewChangeDone messages'.format(self, diff))
            return False

        logger.info("{} got view change quorum ({} >= {})".
                    format(self.name, num_of_ready_nodes, self.quorum))
        return True

    @property
    def has_view_change_from_primary(self) -> bool:
        if not self._has_view_change_from_primary:
            next_primary_name = self.provider.next_primary_name()

            if next_primary_name not in self._view_change_done:
                logger.info("{} has not received ViewChangeDone from the next "
                            "primary {} (view_no: {}, totalNodes: {})".
                            format(self.name, next_primary_name, self.view_no, self.quorums.n))
            else:
                logger.info('{} received ViewChangeDone from primary {}'.format(self, next_primary_name))
                self._has_view_change_from_primary = True

        return self._has_view_change_from_primary

    @property
    def has_acceptable_view_change_quorum(self):
        if not self._has_acceptable_view_change_quorum:
            self._has_acceptable_view_change_quorum = \
                (self._hasViewChangeQuorum and self.has_view_change_from_primary)
        return self._has_acceptable_view_change_quorum

    @property
    def is_behind_for_view(self) -> bool:
        # Checks if the node is currently behind the accepted state for this
        # view; it only makes sense to call this when the node has an
        # acceptable view change quorum
        _, accepted_ledger_summary = self.get_sufficient_same_view_change_done_messages()
        for (ledgerId, own_ledger_size, _), (_, accepted_ledger_size, _) in \
                zip(self.provider.ledger_summary(), accepted_ledger_summary):
            if own_ledger_size < accepted_ledger_size:
                logger.info("{} ledger {} sizes are differ: own {} accepted {}".
                            format(self, ledgerId, own_ledger_size, accepted_ledger_size))
                return True
        return False

    # __ PROPERTIES __

    # EXTERNAL EVENTS

    def on_master_degradation(self):
        self.propose_view_change()

    def check_freshness(self):
        if self.is_state_fresh_enough():
            logger.debug("{} not sending instance change because found state to be fresh enough".format(self))
            return
        self.propose_view_change(Suspicions.STATE_SIGS_ARE_NOT_UPDATED)

    def send_instance_change_if_needed(self, proposed_view_no, reason):
        can, whyNot = self._canViewChange(proposed_view_no)
        # If the scheduled action fires after the view change has already
        # completed, it must be stopped as well.
        if not can and self.view_no < proposed_view_no and self.provider.is_primary_disconnected():
            # Resend the same instance change message if we have not yet
            # achieved an InstanceChange quorum
            logger.info("Resend instance change message to all recipients")
            self.sendInstanceChange(proposed_view_no, reason)
            self._timer.schedule(self.config.INSTANCE_CHANGE_TIMEOUT,
                                 self.instance_change_action)
            logger.info("Count of rounds without quorum of "
                        "instance change messages: {}".format(self.instance_change_rounds))
            self.instance_change_rounds += 1
        else:
            # ViewChange procedure was started, therefore stop scheduling
            # resending instanceChange messages
            logger.info("Stop scheduling instance change resending")
            self._timer.cancel(self.instance_change_action)
            self.instance_change_action = None
            self.instance_change_rounds = 0

    def on_primary_loss(self):
        view_no = self.propose_view_change(Suspicions.PRIMARY_DISCONNECTED)
        if self.instance_change_action:
            # It's an action, scheduled for previous instanceChange round
            logger.info("Stop previous instance change resending schedule")
            self._timer.cancel(self.instance_change_action)
            self.instance_change_rounds = 0
        self.instance_change_action = partial(self.send_instance_change_if_needed,
                                              view_no,
                                              Suspicions.PRIMARY_DISCONNECTED)
        self._timer.schedule(self.config.INSTANCE_CHANGE_TIMEOUT,
                             self.instance_change_action)

    # TODO we have `on_primary_loss`, do we need that one?
    def on_primary_about_to_be_disconnected(self):
        self.propose_view_change(Suspicions.PRIMARY_ABOUT_TO_BE_DISCONNECTED)

    def on_suspicious_primary(self, suspicion: Suspicions):
        self.propose_view_change(suspicion)

    def on_view_change_not_completed_in_time(self):
        self.propose_view_change(Suspicions.INSTANCE_CHANGE_TIMEOUT)
        self.provider.schedule_resend_inst_chng()

    def on_replicas_count_changed(self):
        self.propose_view_change(Suspicions.REPLICAS_COUNT_CHANGED)

    def on_catchup_complete(self):
        if not self.provider.is_node_synced():
            raise LogicError('on_catchup_complete can be called only after catchup completed')
        if self.provider.is_primary() is not None:
            raise LogicError('Primary on master replica cannot be elected yet')
        self._send_view_change_done_message()
        self._start_selection()

    def process_future_view_vchd_msg(self, future_vcd_msg: FutureViewChangeDone, frm):
        # if we already started a view change then do not decide on a new one
        if self.view_change_in_progress:
            return

        view_no = future_vcd_msg.vcd_msg.viewNo
        # ToDo maybe we should compare with last_completed_view_no instead of viewNo.
        if view_no <= self.view_no:
            # it means we already processed this future View Change Done
            return

        if view_no not in self._next_view_indications:
            self._next_view_indications[view_no] = {}
        self._next_view_indications[view_no][frm] = future_vcd_msg.vcd_msg
        self._do_view_change_by_future_vcd(view_no)

    # __ EXTERNAL EVENTS __

    def process_instance_change_msg(self, instChg: InstanceChange, frm: str) -> None:
        """
        Validate and process an instance change request.

        :param instChg: the instance change request
        :param frm: the name of the node that sent this `msg`
        """
        if frm not in self.provider.connected_nodes():
            self.provider.discard(
                instChg,
                "received instance change request: {} from {} "
                "which is not in connected list: {}".format(
                    instChg, frm, self.provider.connected_nodes()), logger.info)
            return

        logger.info("{} received instance change request: {} from {}".format(self, instChg, frm))

        # TODO: add sender to blacklist?
        if not isinstance(instChg.viewNo, int):
            self.provider.discard(
                instChg, "{}field view_no has incorrect type: {}".format(
                    VIEW_CHANGE_PREFIX, type(instChg.viewNo)))
        elif instChg.viewNo <= self.view_no:
            self.provider.discard(
                instChg,
                "Received instance change request with view no {} "
                "which is not more than its view no {}".format(
                    instChg.viewNo, self.view_no), logger.info)
        else:
            # Record instance changes for views, but send an instance change only
            # when the master is found to be degraded. If a quorum of instance
            # changes is found, change the view even if the master is not degraded.
            self._on_verified_instance_change_msg(instChg, frm)

            if self.instance_changes.has_inst_chng_from(instChg.viewNo, self.name):
                logger.info("{} received instance change message {} but has already "
                            "sent an instance change message".format(self, instChg))
            elif not self.provider.is_master_degraded():
                logger.info("{} received instance change message {} but did not "
                            "find the master to be slow".format(self, instChg))
            else:
                logger.display("{}{} found master degraded after receiving instance change"
                               " message from {}".format(VIEW_CHANGE_PREFIX, self, frm))
                self.sendInstanceChange(instChg.viewNo)

    def process_vchd_msg(self, msg: ViewChangeDone, sender: str) -> bool:
        """
        Processes ViewChangeDone messages. Once n-f messages have been
        received, decides on a primary for specific replica.

        :param msg: ViewChangeDone message
        :param sender: the name of the node from which this message was sent
        """
        logger.info("{}'s primary selector started processing of ViewChangeDone msg from {} : {}".
                    format(self.name, sender, msg))
        view_no = msg.viewNo
        if self.view_no != view_no:
            self.provider.discard(msg, '{} got Primary from {} for view no {} '
                                       'whereas current view no is {}'.
                                  format(self, sender, view_no, self.view_no),
                                  logMethod=logger.info)
            return False

        new_primary_name = msg.name
        if new_primary_name == self.previous_master_primary:
            self.provider.discard(msg, '{} got Primary from {} for {} who was primary of '
                                       'master in previous view too'.
                                  format(self, sender, new_primary_name),
                                  logMethod=logger.info)
            return False

        # Since a node can send ViewChangeDone more than one time
        self._on_verified_view_change_done_msg(msg, sender)
        # TODO why do we check that after the message tracking
        if self.provider.has_primary():
            self.provider.discard(msg, "it already decided primary which is {}".
                                  format(self.provider.current_primary_name()), logger.info)
            return False

        self._start_selection()

    def send(self, msg):
        """
        Send a message to the node.

        :param msg: the message to send
        """
        logger.debug("{}'s view_changer sending {}".format(self.name, msg))
        self.outBox.append(msg)

    async def serviceQueues(self, limit=None) -> int:
        """
        Service at most `limit` messages from the inBox.

        :param limit: the maximum number of messages to service
        :return: the number of messages successfully processed
        """
        # do not start any view changes until catch-up is finished!
        if not Mode.is_done_syncing(self.provider.node_mode()):
            return 0
        return await self.inBoxRouter.handleAll(self.inBox, limit)

    def sendInstanceChange(self, view_no: int,
                           suspicion=Suspicions.PRIMARY_DEGRADED):
        """
        Broadcast an instance change request to all the remaining nodes

        :param view_no: the view number when the instance change is requested
        """

        # An instance change message can be sent only if no instance change
        # message has been sent within the last `ViewChangeWindowSize` seconds,
        # i.e. the last one was sent long enough ago.

        logger.info(
            "{}{} sending an instance change with view_no {}"
            " since {}".format(
                VIEW_CHANGE_PREFIX,
                self,
                view_no,
                suspicion.reason))
        logger.info("{}{} metrics for monitor: {}"
                    .format(MONITORING_PREFIX, self,
                            self.provider.pretty_metrics()))
        msg = self._create_instance_change_msg(view_no, suspicion.code)
        self.send(msg)
        # record instance change vote for self and try to change the view
        # if quorum is reached
        self._on_verified_instance_change_msg(msg, self.name)

    def _create_instance_change_msg(self, view_no, suspicion_code):
        return InstanceChange(view_no, suspicion_code)

    def _on_verified_instance_change_msg(self, msg, frm):
        view_no = msg.viewNo

        if not self.instance_changes.has_inst_chng_from(view_no, frm):
            self.instance_changes.add_vote(msg, frm)
            if view_no > self.view_no:
                self._start_view_change_by_instance_change(view_no)

    def _start_view_change_by_instance_change(self, view_no):
        # TODO: Need to handle skewed distributions which can arise due to
        # malicious nodes sending messages early on
        can, whyNot = self._canViewChange(view_no)
        if can:
            logger.display("{}{} initiating a view change to {} from {}".
                           format(VIEW_CHANGE_PREFIX, self, view_no, self.view_no))
            self.start_view_change(view_no)
        else:
            logger.info(whyNot)
        return can

    def _quorum_is_reached(self, count):
        return self.quorums.view_change_done.is_reached(count)

    def _do_view_change_by_future_vcd(self, view_no) -> bool:
        ind_count = len(self._next_view_indications[view_no])
        if self._quorum_is_reached(ind_count):
            logger.display('{}{} starting view change for {} after {} view change '
                           'indications from other nodes'.format(VIEW_CHANGE_PREFIX, self, view_no, ind_count))
            self.start_view_change(view_no)
            return True
        return False

    def _canViewChange(self, proposedViewNo: int) -> (bool, str):
        """
        Return whether there is an InstanceChange quorum for the proposed view
        number and this node's current view is lower than the proposed view
        """
        msg = None
        quorum = self.quorums.view_change.value
        if not self.instance_changes.has_quorum(proposedViewNo, quorum):
            msg = '{} has no quorum for view {}'.format(self, proposedViewNo)
        elif not proposedViewNo > self.view_no:
            msg = '{} is in higher view more than {}'.format(
                self, proposedViewNo)

        return not bool(msg), msg

    def start_view_change(self, proposed_view_no: int, continue_vc=False):
        """
        Trigger the view change process.

        :param proposed_view_no: the new view number after view change.
        """
        # TODO: consider moving this to pool manager
        # TODO: view change is a special case, which can have different
        # implementations - we need to make this logic pluggable

        if self.pre_vc_strategy and (not continue_vc):
            self.pre_view_change_in_progress = True
            self.pre_vc_strategy.prepare_view_change(proposed_view_no)
            return
        elif self.pre_vc_strategy:
            self.pre_vc_strategy.on_strategy_complete()

        self.previous_view_no = self.view_no
        self.view_no = proposed_view_no
        self.pre_view_change_in_progress = False
        self.view_change_in_progress = True
        self.previous_master_primary = self.provider.current_primary_name()
        self.set_defaults()
        self._process_vcd_for_future_view()

        self.provider.notify_view_change_start()
        self.provider.start_catchup()

    def _process_vcd_for_future_view(self):
        # make sure that all received VCD messages for future view
        # (including the current view) are stored, as they will be needed for a quorum
        # to finish the View Change and start selection.
        # This is especially critical for Propagate Primary mode (on receiving CURRENT_STATE by a new node).
        if self.view_no in self._next_view_indications:
            for frm, vcd in self._next_view_indications[self.view_no].items():
                # we call _on_verified_view_change_done_msg, not process_vchd_msg,
                # since we may be in propagate primary mode where some of validation inside process_vchd_msg
                # is not correct (for example, checking that the new primary differs from the current one)
                self._on_verified_view_change_done_msg(vcd, frm)

        # remove all for previous views
        for view_no in tuple(self._next_view_indications.keys()):
            if view_no <= self.view_no:
                del self._next_view_indications[view_no]

    def _on_verified_view_change_done_msg(self, msg, frm):
        new_primary_name = msg.name
        ledger_summary = msg.ledgerInfo

        # TODO what is the case when node sends several different
        # view change done messages
        data = (new_primary_name, ledger_summary)
        self._view_change_done[frm] = data

    def _start_selection(self):

        error = None

        if not self.provider.is_node_synced():
            error = "mode is {}".format(self.provider.node_mode())
        elif not self.has_acceptable_view_change_quorum:
            error = "has no view change quorum or no message from next primary"
        else:
            rv = self.get_sufficient_same_view_change_done_messages()
            if rv is None:
                error = "there are not sufficient same ViewChangeDone messages"
            elif not self._verify_primary(*rv):
                error = "failed to verify primary"

        if error is not None:
            logger.info('{} cannot start primary selection because {}'.format(self, error))
            return

        if self.is_behind_for_view:
            logger.info('{} is synced and has an acceptable view change quorum '
                        'but is behind the accepted state'.format(self))
            self.provider.start_catchup()
            return

        self.provider.select_primaries()

        if self.view_change_in_progress:
            self.view_change_in_progress = False
            self.provider.notify_view_change_complete()
            # INSTANCE_CHANGE messages were recorded in instance_changes keyed by
            # msg.view_no. Once the view change has occurred and view_no has been
            # updated, all INSTANCE_CHANGE messages recorded for the previous and
            # the current view number should be removed, which remove_view does here.
            self.instance_changes.remove_view(self.view_no)
            self.previous_view_no = None
            self.previous_master_primary = None

    def set_defaults(self):
        # Tracks view change done message
        self._view_change_done = {}  # replica name -> data

        # Set when an appropriate view change quorum is found which has
        # sufficient same ViewChangeDone messages
        self._primary_verified = False

        self._has_view_change_from_primary = False

        self._has_acceptable_view_change_quorum = False

        self._accepted_view_change_done_message = None

    def get_sufficient_same_view_change_done_messages(self) -> Optional[Tuple]:
        # Returns the accepted (primary, ledger info) pair once there is a
        # quorum of identical ViewChangeDone messages
        # TODO: Does not look like optimal implementation.
        if self._accepted_view_change_done_message is None and \
                self._view_change_done:
            votes = self._view_change_done.values()
            votes = [(nm, tuple(tuple(i) for i in info)) for nm, info in votes]
            (new_primary, ledger_info), vote_count = mostCommonElement(votes)
            if vote_count >= self.quorum:
                logger.info('{} found acceptable primary {} and ledger info {}'.
                            format(self, new_primary, ledger_info))
                self._accepted_view_change_done_message = (new_primary,
                                                           ledger_info)
            else:
                logger.info('{} does not have acceptable primary, only {} votes for {}'.
                            format(self, vote_count, (new_primary, ledger_info)))

        return self._accepted_view_change_done_message

    def _verify_primary(self, new_primary, ledger_info):
        """
        This method is called when a sufficient number of ViewChangeDone
        messages have been received; it takes steps to switch to the new primary
        """
        expected_primary = self.provider.next_primary_name()
        if new_primary != expected_primary:
            logger.error("{}{} expected next primary to be {}, but majority "
                         "declared {} instead for view {}"
                         .format(PRIMARY_SELECTION_PREFIX, self.name,
                                 expected_primary, new_primary, self.view_no))
            return False

        self._primary_verified = True
        return True
        # TODO: check if ledger status is expected

    def _send_view_change_done_message(self):
        """
        Sends ViewChangeDone message to other protocol participants
        """
        new_primary_name = self.provider.next_primary_name()
        ledger_summary = self.provider.ledger_summary()
        message = ViewChangeDone(self.view_no,
                                 new_primary_name,
                                 ledger_summary)

        logger.info("{} is sending ViewChangeDone msg to all : {}".format(self, message))

        self.send(message)
        self._on_verified_view_change_done_msg(message, self.name)

    # overridden method of PrimaryDecider
    def get_msgs_for_lagged_nodes(self) -> List[ViewChangeDone]:
        # Should not return a list, only done for compatibility with interface
        """
        Returns the last accepted `ViewChangeDone` message.
        If no view change has happened returns ViewChangeDone
        with view no 0 to a newly joined node
        """
        # TODO: Consider a case where more than one node joins immediately,
        # then one of the node might not have an accepted
        # ViewChangeDone message
        messages = []
        accepted = self._accepted_view_change_done_message
        if accepted:
            messages.append(ViewChangeDone(self.last_completed_view_no, *accepted))
        elif self.name in self._view_change_done:
            messages.append(ViewChangeDone(self.last_completed_view_no,
                                           *self._view_change_done[self.name]))
        else:
            logger.info('{} has no ViewChangeDone message to send for view {}'.
                        format(self, self.view_no))
        return messages

    def propose_view_change(self, suspicion=Suspicions.PRIMARY_DEGRADED):
        proposed_view_no = self.view_no
        # TODO: For some reason not incrementing view_no in most cases leads to lots of failing/flaky tests
        # if suspicion == Suspicions.INSTANCE_CHANGE_TIMEOUT or not self.view_change_in_progress:
        if suspicion != Suspicions.STATE_SIGS_ARE_NOT_UPDATED or not self.view_change_in_progress:
            proposed_view_no += 1
        self.sendInstanceChange(proposed_view_no, suspicion)
        return proposed_view_no

    def is_state_fresh_enough(self):
        threshold = self.config.ACCEPTABLE_FRESHNESS_INTERVALS_COUNT * self.config.STATE_FRESHNESS_UPDATE_INTERVAL
        return self.provider.state_freshness() < threshold or (not self.view_change_in_progress and
                                                               not Mode.is_done_syncing(self.provider.node_mode()))
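
get_sufficient_same_view_change_done_messages above tallies the (primary, ledger info) pairs from _view_change_done with mostCommonElement and accepts the winner once it reaches the ViewChangeDone quorum. Below is a small stand-alone sketch of that tally using collections.Counter instead of the plenum helper, under the same data shapes.

from collections import Counter


def most_common_vote(view_change_done: dict, quorum: int):
    # view_change_done maps sender -> (new_primary_name, ledger_summary),
    # mirroring self._view_change_done above; quorum mirrors self.quorum.
    votes = [(name, tuple(tuple(i) for i in info))
             for name, info in view_change_done.values()]
    if not votes:
        return None
    winner, count = Counter(votes).most_common(1)[0]
    return winner if count >= quorum else None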
Example #7
class ViewChanger():

    def __init__(self, provider: ViewChangerDataProvider, timer: TimerService):
        self.provider = provider
        self._timer = timer

        self.inBox = deque()
        self.outBox = deque()
        self.inBoxRouter = Router(
            (InstanceChange, self.process_instance_change_msg)
        )

        self.instance_changes = InstanceChangeProvider(self.config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
                                                       node_status_db=self.provider.node_status_db)

        self.previous_view_no = None

        # Action scheduled for resending instanceChange messages
        self.instance_change_action = None

        # Count of instance change rounds
        self.instance_change_rounds = 0

        # Timestamp of view change start
        self.start_view_change_ts = 0

        # Force periodic view change if enabled in config
        force_view_change_freq = self.config.ForceViewChangeFreq
        if force_view_change_freq > 0:
            RepeatingTimer(self._timer, force_view_change_freq, self.on_master_degradation)

        # Start periodic freshness check
        state_freshness_update_interval = self.config.STATE_FRESHNESS_UPDATE_INTERVAL
        if state_freshness_update_interval > 0:
            RepeatingTimer(self._timer, state_freshness_update_interval, self.check_freshness)

    def __repr__(self):
        return "{}".format(self.name)

    # PROPERTIES

    @property
    def view_no(self):
        return self.provider.view_no()

    @property
    def name(self) -> str:
        return self.provider.name()

    @property
    def config(self) -> object:
        return self.provider.config()

    @property
    def quorums(self) -> Quorums:
        return self.provider.quorums()

    @property
    def view_change_in_progress(self) -> bool:
        return self.provider.view_change_in_progress()

    @property
    def quorum(self) -> int:
        return self.quorums.view_change_done.value

    # __ PROPERTIES __

    # EXTERNAL EVENTS

    def on_master_degradation(self):
        self.propose_view_change()

    def check_freshness(self):
        if self.is_state_fresh_enough():
            logger.debug("{} not sending instance change because found state to be fresh enough".format(self))
            return
        self.propose_view_change(Suspicions.STATE_SIGS_ARE_NOT_UPDATED)

    def send_instance_change_if_needed(self, proposed_view_no, reason):
        can, whyNot = self._canViewChange(proposed_view_no)
        # If the scheduled action fires after the view change has already
        # completed, it must be stopped as well.
        if not can and self.view_no < proposed_view_no and self.provider.is_primary_disconnected():
            # Resend the same instance change message if we have not yet
            # achieved an InstanceChange quorum
            logger.info("Resend instance change message to all recipients")
            self.sendInstanceChange(proposed_view_no, reason)
            self._timer.schedule(self.config.NEW_VIEW_TIMEOUT,
                                 self.instance_change_action)
            logger.info("Count of rounds without quorum of "
                        "instance change messages: {}".format(self.instance_change_rounds))
            self.instance_change_rounds += 1
        else:
            # ViewChange procedure was started, therefore stop scheduling
            # resending instanceChange messages
            logger.info("Stop scheduling instance change resending")
            self._timer.cancel(self.instance_change_action)
            self.instance_change_action = None
            self.instance_change_rounds = 0

    def on_primary_loss(self):
        view_no = self.propose_view_change(Suspicions.PRIMARY_DISCONNECTED)
        if self.instance_change_action:
            # It's an action, scheduled for previous instanceChange round
            logger.info("Stop previous instance change resending schedule")
            self._timer.cancel(self.instance_change_action)
            self.instance_change_rounds = 0
        self.instance_change_action = partial(self.send_instance_change_if_needed,
                                              view_no,
                                              Suspicions.PRIMARY_DISCONNECTED)
        self._timer.schedule(self.config.NEW_VIEW_TIMEOUT,
                             self.instance_change_action)

    # TODO we have `on_primary_loss`, do we need that one?
    def on_primary_about_to_be_disconnected(self):
        self.propose_view_change(Suspicions.PRIMARY_ABOUT_TO_BE_DISCONNECTED)

    def on_suspicious_primary(self, suspicion: Suspicions):
        self.propose_view_change(suspicion)

    def on_node_count_changed(self):
        self.propose_view_change(Suspicions.NODE_COUNT_CHANGED)

    # __ EXTERNAL EVENTS __

    def process_instance_change_msg(self, instChg: InstanceChange, frm: str) -> None:
        """
        Validate and process an instance change request.

        :param instChg: the instance change request
        :param frm: the name of the node that sent this `msg`
        """
        if frm not in self.provider.connected_nodes():
            self.provider.discard(
                instChg,
                "received instance change request: {} from {} "
                "which is not in connected list: {}".format(
                    instChg, frm, self.provider.connected_nodes()), logger.info)
            return

        logger.info("{} received instance change request: {} from {}".format(self, instChg, frm))

        # TODO: add sender to blacklist?
        if not isinstance(instChg.viewNo, int):
            self.provider.discard(
                instChg, "{}field view_no has incorrect type: {}".format(
                    VIEW_CHANGE_PREFIX, type(instChg.viewNo)))
        elif instChg.viewNo <= self.view_no:
            self.provider.discard(
                instChg,
                "Received instance change request with view no {} "
                "which is not more than its view no {}".format(
                    instChg.viewNo, self.view_no), logger.info)
        else:
            # Record instance changes for views, but send an instance change only
            # when the master is found to be degraded. If a quorum of instance
            # changes is found, change the view even if the master is not degraded.
            self._on_verified_instance_change_msg(instChg, frm)

            if self.instance_changes.has_inst_chng_from(instChg.viewNo, self.name):
                logger.info("{} received instance change message {} but has already "
                            "sent an instance change message".format(self, instChg))
            elif not self.provider.is_master_degraded():
                logger.info("{} received instance change message {} but did not "
                            "find the master to be slow".format(self, instChg))
            else:
                logger.display("{}{} found master degraded after receiving instance change"
                               " message from {}".format(VIEW_CHANGE_PREFIX, self, frm))
                self.sendInstanceChange(instChg.viewNo)

    def send(self, msg):
        """
        Send a message to the node.

        :param msg: the message to send
        """
        logger.debug("{}'s view_changer sending {}".format(self.name, msg))
        self.outBox.append(msg)

    async def serviceQueues(self, limit=None) -> int:
        """
        Service at most `limit` messages from the inBox.

        :param limit: the maximum number of messages to service
        :return: the number of messages successfully processed
        """
        # do not start any view changes until catch-up is finished!
        if not Mode.is_done_syncing(self.provider.node_mode()):
            return 0
        return await self.inBoxRouter.handleAll(self.inBox, limit)

    def sendInstanceChange(self, view_no: int,
                           suspicion=Suspicions.PRIMARY_DEGRADED):
        """
        Broadcast an instance change request to all the remaining nodes

        :param view_no: the view number when the instance change is requested
        """

        # An instance change message can be sent only if no instance change
        # message has been sent within the last `ViewChangeWindowSize` seconds,
        # i.e. the last one was sent long enough ago.

        logger.info(
            "{}{} sending an instance change with view_no {}"
            " since {}".format(
                VIEW_CHANGE_PREFIX,
                self,
                view_no,
                suspicion.reason))
        logger.info("{}{} metrics for monitor: {}"
                    .format(MONITORING_PREFIX, self,
                            self.provider.pretty_metrics()))
        msg = self._create_instance_change_msg(view_no, suspicion.code)
        self.send(msg)
        # record instance change vote for self and try to change the view
        # if quorum is reached
        self._on_verified_instance_change_msg(msg, self.name)

    def _create_instance_change_msg(self, view_no, suspicion_code):
        return InstanceChange(view_no, suspicion_code)

    def _on_verified_instance_change_msg(self, msg, frm):
        view_no = msg.viewNo

        if not self.instance_changes.has_inst_chng_from(view_no, frm):
            self.instance_changes.add_vote(msg, frm)
            if view_no > self.view_no:
                self._start_view_change_by_instance_change(view_no)

    def _start_view_change_by_instance_change(self, view_no):
        # TODO: Need to handle skewed distributions which can arise due to
        # malicious nodes sending messages early on
        can, whyNot = self._canViewChange(view_no)
        if can:
            logger.display("{}{} initiating a view change to {} from {}".
                           format(VIEW_CHANGE_PREFIX, self, view_no, self.view_no))
            self.start_view_change(view_no)
        else:
            logger.info(whyNot)
        return can

    def _canViewChange(self, proposedViewNo: int) -> (bool, str):
        """
        Return whether there is an InstanceChange quorum for the proposed view
        number and this node's current view is lower than the proposed view
        """
        msg = None
        quorum = self.quorums.view_change.value
        if not self.instance_changes.has_quorum(proposedViewNo, quorum):
            msg = '{} has no quorum for view {}'.format(self, proposedViewNo)
        elif not proposedViewNo > self.view_no:
            msg = '{} is in higher view more than {}'.format(
                self, proposedViewNo)

        return not bool(msg), msg

    def start_view_change(self, proposed_view_no: int, continue_vc=False):
        """
        Trigger the view change process.

        :param proposed_view_no: the new view number after view change.
        """
        self.previous_view_no = self.view_no

        self.provider.notify_view_change_start()
        self.provider.start_view_change(proposed_view_no)

    # TODO: Check whether these still need to be called somewhere after view change:
    #  - self.instance_changes.remove_view(self.view_no)

    def propose_view_change(self, suspicion=Suspicions.PRIMARY_DEGRADED):
        proposed_view_no = self.view_no
        if suspicion != Suspicions.STATE_SIGS_ARE_NOT_UPDATED or not self.view_change_in_progress:
            proposed_view_no += 1
        self.sendInstanceChange(proposed_view_no, suspicion)
        return proposed_view_no

    def is_state_fresh_enough(self):
        threshold = self.config.ACCEPTABLE_FRESHNESS_INTERVALS_COUNT * self.config.STATE_FRESHNESS_UPDATE_INTERVAL
        return self.provider.state_freshness() < threshold or (not self.view_change_in_progress and
                                                               not Mode.is_done_syncing(self.provider.node_mode()))
Example #8
class ViewChangeTriggerService:
    def __init__(self,
                 data: ConsensusSharedData,
                 timer: TimerService,
                 bus: InternalBus,
                 network: ExternalBus,
                 db_manager: DatabaseManager,
                 stasher: StashingRouter,
                 is_master_degraded: Callable[[], bool],
                 metrics: MetricsCollector = NullMetricsCollector()):
        self._data = data
        self._timer = timer
        self._bus = bus
        self._network = network
        self._stasher = stasher
        self._is_master_degraded = is_master_degraded
        self.metrics = metrics

        self._config = getConfig()

        self._instance_changes = \
            InstanceChangeProvider(outdated_ic_interval=self._config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
                                   node_status_db=db_manager.get_store(NODE_STATUS_DB_LABEL),
                                   time_provider=timer.get_current_time)

        self._subscription = Subscription()
        self._subscription.subscribe(bus, VoteForViewChange, self.process_vote_for_view_change)
        self._subscription.subscribe(bus, NewViewAccepted, self.process_new_view_accepted)
        self._subscription.subscribe(stasher, InstanceChange, self.process_instance_change)

    def cleanup(self):
        self._subscription.unsubscribe_all()

    @property
    def name(self):
        return replica_name_to_node_name(self._data.name)

    def __repr__(self):
        return self.name

    def process_vote_for_view_change(self, msg: VoteForViewChange):
        proposed_view_no = self._data.view_no
        # TODO: Some time ago it was proposed that view_no should not be increased during proposal
        #  if view change is already in progress, unless suspicion code is "view change is taking too long".
        #  Idea was to improve stability of view change triggering, however for some reason this change lead
        #  to lots of failing/flaky tests. This still needs to be investigated.
        # if suspicion == Suspicions.INSTANCE_CHANGE_TIMEOUT or not self.view_change_in_progress:
        if msg.suspicion != Suspicions.STATE_SIGS_ARE_NOT_UPDATED or not self._data.waiting_for_new_view:
            proposed_view_no += 1
        self._send_instance_change(proposed_view_no, msg.suspicion)

    def process_instance_change(self, msg: InstanceChange, frm: str):
        frm = replica_name_to_node_name(frm)

        # TODO: Do we really need this?
        if frm not in self._network.connecteds:
            return DISCARD, "instance change request: {} from {} which is not in connected list: {}".\
                format(msg, frm, self._network.connecteds)

        if not self._data.is_participating:
            return STASH_CATCH_UP, CATCHING_UP

        logger.info("{} received instance change request: {} from {}".format(self, msg, frm))

        if msg.viewNo <= self._data.view_no:
            return DISCARD, "instance change request with view no {} which is not more than its view no {}".\
                format(msg.viewNo, self._data.view_no)

        # Record instance changes for views, but send an instance change only
        # when the master is found to be degraded. If a quorum of instance
        # changes is found, change the view even if the master is not degraded.
        self._on_verified_instance_change_msg(msg, frm)

        if self._instance_changes.has_inst_chng_from(msg.viewNo, self.name):
            logger.info("{} received instance change message {} "
                        "but has already sent an instance change message".format(self, msg))
        elif not self._is_master_degraded():
            logger.info("{} received instance change message {} "
                        "but did not find the master to be slow".format(self, msg))
        else:
            logger.display("{}{} found master degraded after "
                           "receiving instance change message from {}".format(VIEW_CHANGE_PREFIX, self, frm))
            self._send_instance_change(msg.viewNo, Suspicions.PRIMARY_DEGRADED)

    def process_new_view_accepted(self, msg: NewViewAccepted):
        self._instance_changes.remove_view(self._data.view_no)

    def _send_instance_change(self, view_no: int, suspicion: Suspicion):
        logger.info("{}{} sending an instance change with view_no {} since {}".
                    format(VIEW_CHANGE_PREFIX, self, view_no, suspicion.reason))
        msg = InstanceChange(view_no, suspicion.code)
        self._network.send(msg)
        # record instance change vote for self and try to change the view if quorum is reached
        self._on_verified_instance_change_msg(msg, self.name)

    def _on_verified_instance_change_msg(self, msg: InstanceChange, frm: str):
        view_no = msg.viewNo

        if not self._instance_changes.has_inst_chng_from(view_no, frm):
            self._instance_changes.add_vote(msg, frm)
            if view_no > self._data.view_no:
                self._try_start_view_change_by_instance_change(view_no)

    def _try_start_view_change_by_instance_change(self, proposed_view_no: int) -> bool:
        # TODO: Need to handle skewed distributions which can arise due to
        #  malicious nodes sending messages early on
        can, why_not = self._can_view_change(proposed_view_no)
        if can:
            logger.display("{}{} initiating a view change to {} from {}".
                           format(VIEW_CHANGE_PREFIX, self, proposed_view_no, self._data.view_no))
            self._bus.send(NodeNeedViewChange(view_no=proposed_view_no))
        else:
            logger.info(why_not)
        return can

    def _can_view_change(self, proposed_view_no: int) -> (bool, str):
        quorum = self._data.quorums.view_change.value
        if not self._instance_changes.has_quorum(proposed_view_no, quorum):
            return False, '{} has no quorum for view {}'.format(self, proposed_view_no)
        if not proposed_view_no > self._data.view_no:
            return False, '{} is in higher view more than {}'.format(self, proposed_view_no)
        return True, ''
Example #9
def instance_change_provider(tconf, node_status_db, time_provider):
    return InstanceChangeProvider(
        tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL, node_status_db,
        time_provider)
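
The fixture above can drive further checks against the cache API used throughout this listing. Below is a hedged sketch of a cleanup test, assuming remove_view(view_no) (as called from process_new_view_accepted and _start_selection) drops the votes recorded for that view.

def test_remove_view_clears_votes_sketch(instance_change_provider):
    # Hypothetical sketch, not a test from the code base.
    view_no = 1
    msg = InstanceChange(view_no, Suspicions.PRIMARY_DEGRADED.code)
    instance_change_provider.add_vote(msg, "Node1")
    assert instance_change_provider.has_view(view_no)

    # remove_view is what process_new_view_accepted calls after a view change
    instance_change_provider.remove_view(view_no)
    assert not instance_change_provider.has_view(view_no)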