def createMonitor():
    return AccumulatingMonitorStrategy(
        start_time=START_TIME,
        instances=2,
        txn_delta_k=ACC_MONITOR_TXN_DELTA_K,
        timeout=ACC_MONITOR_TIMEOUT,
        input_rate_reaction_half_time=ACC_MONITOR_INPUT_RATE_REACTION_HALF_TIME
    )
def test_acc_monitor_detects_backup_slowdown(monitor: AccumulatingMonitorStrategy):
    # Simulate txns receiving and ordering
    triggered_time = simulate_ordering(monitor, 100.0, 0)

    # Monitor should not be triggered before timeout
    monitor.update_time(triggered_time + 0.8 * ACC_MONITOR_TIMEOUT)
    assert not monitor.is_master_degraded()
    assert not monitor.is_instance_degraded(1)

    # Monitor should be triggered on backup after timeout
    monitor.update_time(triggered_time + 1.2 * ACC_MONITOR_TIMEOUT)
    assert not monitor.is_master_degraded()
    assert monitor.is_instance_degraded(1)
Beispiel #3
0
    def __init__(self, name: str, Delta: float, Lambda: float, Omega: float,
                 instances: Instances, nodestack,
                 blacklister: Blacklister, nodeInfo: Dict,
                 notifierEventTriggeringConfig: Dict,
                 pluginPaths: Iterable[str] = None,
                 notifierEventsEnabled: bool = True):
        self.name = name
        self.instances = instances
        self.nodestack = nodestack
        self.blacklister = blacklister
        self.nodeInfo = nodeInfo
        self.notifierEventTriggeringConfig = notifierEventTriggeringConfig
        self.notifierEventsEnabled = notifierEventsEnabled

        self.Delta = Delta
        self.Lambda = Lambda
        self.Omega = Omega
        self.statsConsumers = self.getPluginsByType(pluginPaths,
                                                    PLUGIN_TYPE_STATS_CONSUMER)

        self.config = getConfig()

        # Number of ordered requests by each replica. The value at key `i` in
        # the dict is a tuple of the number of ordered requests by replica and
        # the time taken to order those requests by the replica of the `i`th
        # protocol instance
        self.numOrderedRequests = dict()  # type: Dict[int, Tuple[int, int]]

        # Dict(instance_id, throughput) of throughputs for replicas. Key is a instId and value is a instance of
        # ThroughputMeasurement class and provide throughputs evaluating mechanism
        self.throughputs = dict()   # type: Dict[int, ThroughputMeasurement]

        # Utility object for tracking requests order start and end
        # TODO: Has very similar cleanup logic to propagator.Requests
        self.requestTracker = RequestTimeTracker(instances.ids)

        # Request latencies for the master protocol instances. Key of the
        # dictionary is a tuple of client id and request id and the value is
        # the time the master instance took for ordering it
        self.masterReqLatencies = {}  # type: Dict[Tuple[str, int], float]

        # Indicates that request latency in previous snapshot of master req
        # latencies was too high
        self.masterReqLatencyTooHigh = False

        # Request latency(time taken to be ordered) for the client. The value
        # at key `i` in the dict is the LatencyMeasurement object which accumulate
        # average latency and total request for each client.
        self.clientAvgReqLatencies = dict()  # type: Dict[int, LatencyMeasurement]

        # TODO: Set this if this monitor belongs to a node which has primary
        # of master. Will be used to set `totalRequests`
        self.hasMasterPrimary = None

        # Total requests that have been ordered since the node started
        self.totalRequests = 0

        self.started = datetime.utcnow().isoformat()

        self.orderedRequestsInLast = []

        # attention: handlers will work over unordered request only once
        self.unordered_requests_handlers = []  # type: List[Callable]

        # Monitoring suspicious spikes in cluster throughput
        self.clusterThroughputSpikeMonitorData = {
            'value': 0,
            'cnt': 0,
            'accum': []
        }

        psutil.cpu_percent(interval=None)
        self.lastKnownTraffic = self.calculateTraffic()

        self.totalViewChanges = 0
        self._lastPostedViewChange = 0
        HasActionQueue.__init__(self)

        if self.config.SendMonitorStats:
            self.startRepeating(self.sendPeriodicStats,
                                self.config.DashboardUpdateFreq)

        self.startRepeating(
            self.checkPerformance,
            self.config.notifierEventTriggeringConfig['clusterThroughputSpike']['freq'])

        self.startRepeating(self.check_unordered, self.config.UnorderedCheckFreq)

        if 'disable_view_change' in self.config.unsafe:
            self.isMasterDegraded = lambda: False
        if 'disable_monitor' in self.config.unsafe:
            self.requestOrdered = lambda *args, **kwargs: {}
            self.sendPeriodicStats = lambda: None
            self.checkPerformance = lambda: None

        self.latency_avg_for_backup_cls = self.config.LatencyAveragingStrategyClass
        self.latency_measurement_cls = self.config.LatencyMeasurementCls
        self.throughput_avg_strategy_cls = self.config.throughput_averaging_strategy_class

        self.acc_monitor = None

        if self.config.ACC_MONITOR_ENABLED:
            self.acc_monitor = AccumulatingMonitorStrategy(
                start_time=time.perf_counter(),
                instances=instances.ids,
                txn_delta_k=self.config.ACC_MONITOR_TXN_DELTA_K,
                timeout=self.config.ACC_MONITOR_TIMEOUT,
                input_rate_reaction_half_time=self.config.ACC_MONITOR_INPUT_RATE_REACTION_HALF_TIME)
Beispiel #4
0
class Monitor(HasActionQueue, PluginLoaderHelper):
    """
    Implementation of RBFT's monitoring mechanism.

    The monitoring metrics are collected at the level of a node. Each node
    monitors the performance of each instance. Throughput of requests and
    latency per client request are measured.
    """

    def __init__(self, name: str, Delta: float, Lambda: float, Omega: float,
                 instances: Instances, nodestack,
                 blacklister: Blacklister, nodeInfo: Dict,
                 notifierEventTriggeringConfig: Dict,
                 pluginPaths: Iterable[str] = None,
                 notifierEventsEnabled: bool = True):
        self.name = name
        self.instances = instances
        self.nodestack = nodestack
        self.blacklister = blacklister
        self.nodeInfo = nodeInfo
        self.notifierEventTriggeringConfig = notifierEventTriggeringConfig
        self.notifierEventsEnabled = notifierEventsEnabled

        self.Delta = Delta
        self.Lambda = Lambda
        self.Omega = Omega
        self.statsConsumers = self.getPluginsByType(pluginPaths,
                                                    PLUGIN_TYPE_STATS_CONSUMER)

        self.config = getConfig()

        # Number of ordered requests by each replica. The value at key `i` in
        # the dict is a tuple of the number of ordered requests by replica and
        # the time taken to order those requests by the replica of the `i`th
        # protocol instance
        self.numOrderedRequests = dict()  # type: Dict[int, Tuple[int, int]]

        # Dict(instance_id, throughput) of throughputs for replicas. Key is a instId and value is a instance of
        # ThroughputMeasurement class and provide throughputs evaluating mechanism
        self.throughputs = dict()   # type: Dict[int, ThroughputMeasurement]

        # Utility object for tracking requests order start and end
        # TODO: Has very similar cleanup logic to propagator.Requests
        self.requestTracker = RequestTimeTracker(instances.ids)

        # Request latencies for the master protocol instances. Key of the
        # dictionary is a tuple of client id and request id and the value is
        # the time the master instance took for ordering it
        self.masterReqLatencies = {}  # type: Dict[Tuple[str, int], float]

        # Indicates that request latency in previous snapshot of master req
        # latencies was too high
        self.masterReqLatencyTooHigh = False

        # Request latency(time taken to be ordered) for the client. The value
        # at key `i` in the dict is the LatencyMeasurement object which accumulate
        # average latency and total request for each client.
        self.clientAvgReqLatencies = dict()  # type: Dict[int, LatencyMeasurement]

        # TODO: Set this if this monitor belongs to a node which has primary
        # of master. Will be used to set `totalRequests`
        self.hasMasterPrimary = None

        # Total requests that have been ordered since the node started
        self.totalRequests = 0

        self.started = datetime.utcnow().isoformat()

        self.orderedRequestsInLast = []

        # attention: handlers will work over unordered request only once
        self.unordered_requests_handlers = []  # type: List[Callable]

        # Monitoring suspicious spikes in cluster throughput
        self.clusterThroughputSpikeMonitorData = {
            'value': 0,
            'cnt': 0,
            'accum': []
        }

        psutil.cpu_percent(interval=None)
        self.lastKnownTraffic = self.calculateTraffic()

        self.totalViewChanges = 0
        self._lastPostedViewChange = 0
        HasActionQueue.__init__(self)

        if self.config.SendMonitorStats:
            self.startRepeating(self.sendPeriodicStats,
                                self.config.DashboardUpdateFreq)

        self.startRepeating(
            self.checkPerformance,
            self.config.notifierEventTriggeringConfig['clusterThroughputSpike']['freq'])

        self.startRepeating(self.check_unordered, self.config.UnorderedCheckFreq)

        if 'disable_view_change' in self.config.unsafe:
            self.isMasterDegraded = lambda: False
        if 'disable_monitor' in self.config.unsafe:
            self.requestOrdered = lambda *args, **kwargs: {}
            self.sendPeriodicStats = lambda: None
            self.checkPerformance = lambda: None

        self.latency_avg_for_backup_cls = self.config.LatencyAveragingStrategyClass
        self.latency_measurement_cls = self.config.LatencyMeasurementCls
        self.throughput_avg_strategy_cls = self.config.throughput_averaging_strategy_class

        self.acc_monitor = None

        if self.config.ACC_MONITOR_ENABLED:
            self.acc_monitor = AccumulatingMonitorStrategy(
                start_time=time.perf_counter(),
                instances=instances.ids,
                txn_delta_k=self.config.ACC_MONITOR_TXN_DELTA_K,
                timeout=self.config.ACC_MONITOR_TIMEOUT,
                input_rate_reaction_half_time=self.config.ACC_MONITOR_INPUT_RATE_REACTION_HALF_TIME)

    def __repr__(self):
        return self.name

    def metrics(self):
        """
        Calculate and return the metrics.
        """
        masterThrp, backupThrp = self.getThroughputs(self.instances.masterId)
        r = self.instance_throughput_ratio(self.instances.masterId)
        m = [
            ("{} Monitor metrics:".format(self), None),
            ("Delta", self.Delta),
            ("Lambda", self.Lambda),
            ("Omega", self.Omega),
            ("instances started", self.instances.started),
            ("ordered request counts",
             {i: r[0] for i, r in self.numOrderedRequests.items()}),
            ("ordered request durations",
             {i: r[1] for i, r in self.numOrderedRequests.items()}),
            ("master request latencies", self.masterReqLatencies),
            ("client avg request latencies", {i: self.getLatency(i)
                                              for i in self.instances.ids}),
            ("throughput", {i: self.getThroughput(i)
                            for i in self.instances.ids}),
            ("master throughput", masterThrp),
            ("total requests", self.totalRequests),
            ("avg backup throughput", backupThrp),
            ("master throughput ratio", r)]
        return m

    @property
    def prettymetrics(self) -> str:
        """
        Pretty printing for metrics
        """
        rendered = ["{}: {}".format(*m) for m in self.metrics()]
        return "\n            ".join(rendered)

    def calculateTraffic(self):
        currNetwork = psutil.net_io_counters()
        currNetwork = currNetwork.bytes_sent + currNetwork.bytes_recv
        currNetwork /= 1024
        return currNetwork

    @staticmethod
    def create_throughput_measurement(config, start_ts=time.perf_counter()):
        tm = config.throughput_measurement_class(
            **config.throughput_measurement_params)
        tm.init_time(start_ts)
        return tm

    def reset(self):
        """
        Reset the monitor. Sets all monitored values to defaults.
        """
        logger.debug("{}'s Monitor being reset".format(self))
        instances_ids = self.instances.started.keys()
        self.numOrderedRequests = {inst_id: (0, 0) for inst_id in instances_ids}
        self.requestTracker.reset()
        self.masterReqLatencies = {}
        self.masterReqLatencyTooHigh = False
        self.totalViewChanges += 1
        self.lastKnownTraffic = self.calculateTraffic()
        if self.acc_monitor:
            self.acc_monitor.reset()
        for i in instances_ids:
            rm = self.create_throughput_measurement(self.config)
            self.throughputs[i] = rm
            lm = self.latency_measurement_cls(self.config)
            self.clientAvgReqLatencies[i] = lm

    def addInstance(self, inst_id):
        """
        Add one protocol instance for monitoring.
        """
        self.instances.add(inst_id)
        self.requestTracker.add_instance(inst_id)
        self.numOrderedRequests[inst_id] = (0, 0)
        rm = self.create_throughput_measurement(self.config)

        self.throughputs[inst_id] = rm
        lm = self.latency_measurement_cls(self.config)
        self.clientAvgReqLatencies[inst_id] = lm
        if self.acc_monitor:
            self.acc_monitor.add_instance(inst_id)

    def removeInstance(self, inst_id):
        if self.acc_monitor:
            self.acc_monitor.remove_instance(inst_id)
        if self.instances.count > 0:
            self.instances.remove(inst_id)
            self.requestTracker.remove_instance(inst_id)
            self.numOrderedRequests.pop(inst_id, None)
            self.clientAvgReqLatencies.pop(inst_id, None)
            self.throughputs.pop(inst_id, None)

    def requestOrdered(self, reqIdrs: List[str], instId: int,
                       requests, byMaster: bool = False) -> Dict:
        """
        Measure the time taken for ordering of a request and return it. Monitor
        might have been reset due to view change due to which this method
        returns None
        """
        now = time.perf_counter()
        if self.acc_monitor:
            self.acc_monitor.update_time(now)
        durations = {}
        for key in reqIdrs:
            if key not in self.requestTracker:
                logger.debug("Got untracked ordered request with digest {}".
                             format(key))
                continue
            if self.acc_monitor:
                self.acc_monitor.request_ordered(key, instId)
            if key in self.requestTracker.handled_unordered():
                started = self.requestTracker.started(key)
                logger.info('Consensus for ReqId: {} was achieved by {}:{} in {} seconds.'
                            .format(key, self.name, instId, now - started))
            duration = self.requestTracker.order(instId, key, now)
            self.throughputs[instId].add_request(now)
            if byMaster:
                # TODO for now, view_change procedure can take more that 15 minutes
                # (5 minutes for catchup and 10 minutes for primary's answer).
                # Therefore, view_change triggering by max latency is not indicative now.
                # self.masterReqLatencies[key] = duration
                self.orderedRequestsInLast.append(now)

            if key in requests:
                identifier = requests[key].request.identifier
                self.clientAvgReqLatencies[instId].add_duration(identifier, duration)

            durations[key] = duration

        reqs, tm = self.numOrderedRequests[instId]
        orderedNow = len(durations)
        self.numOrderedRequests[instId] = (reqs + orderedNow,
                                           tm + sum(durations.values()))

        # TODO: Inefficient, as on every request a minimum of a large list is
        # calculated
        if min(r[0] for r in self.numOrderedRequests.values()) == (reqs + orderedNow):
            # If these requests is ordered by the last instance then increment
            # total requests, but why is this important, why cant is ordering
            # by master not enough?
            self.totalRequests += orderedNow
            self.postOnReqOrdered()
            if 0 == reqs:
                self.postOnNodeStarted(self.started)

        return durations

    def requestUnOrdered(self, key: str):
        """
        Record the time at which request ordering started.
        """
        now = time.perf_counter()
        if self.acc_monitor:
            self.acc_monitor.update_time(now)
            self.acc_monitor.request_received(key)
        self.requestTracker.start(key, now)

    def check_unordered(self):
        now = time.perf_counter()
        new_unordereds = [(req, now - started) for req, started in self.requestTracker.unhandled_unordered()
                          if now - started > self.config.UnorderedCheckFreq]
        if len(new_unordereds) == 0:
            return
        for handler in self.unordered_requests_handlers:
            handler(new_unordereds)
        for unordered in new_unordereds:
            self.requestTracker.handle(unordered[0])
            logger.debug('Following requests were not ordered for more than {} seconds: {}'
                         .format(self.config.UnorderedCheckFreq, unordered[0]))

    def isMasterDegraded(self):
        """
        Return whether the master instance is slow.
        """
        if self.acc_monitor:
            self.acc_monitor.update_time(time.perf_counter())
            return self.acc_monitor.is_master_degraded()
        else:
            return (self.instances.masterId is not None and
                    (self.isMasterThroughputTooLow() or
                     # TODO for now, view_change procedure can take more that 15 minutes
                     # (5 minutes for catchup and 10 minutes for primary's answer).
                     # Therefore, view_change triggering by max latency now is not indicative.
                     # self.isMasterReqLatencyTooHigh() or
                     self.isMasterAvgReqLatencyTooHigh()))

    def areBackupsDegraded(self):
        """
        Return slow instance.
        """
        slow_instances = []
        if self.acc_monitor:
            for instance in self.instances.backupIds:
                if self.acc_monitor.is_instance_degraded(instance):
                    slow_instances.append(instance)
        else:
            for instance in self.instances.backupIds:
                if self.is_instance_throughput_too_low(instance) or \
                        self.is_instance_avg_req_latency_too_high(instance):
                    slow_instances.append(instance)
        return slow_instances

    def instance_throughput_ratio(self, inst_id):
        """
        The relative throughput of an instance compared to the backup
        instances.
        """
        inst_thrp, otherThrp = self.getThroughputs(inst_id)

        # Backup throughput may be 0 so moving ahead only if it is not 0
        r = inst_thrp / otherThrp if otherThrp and inst_thrp is not None \
            else None
        return r

    def isMasterThroughputTooLow(self):
        """
        Return whether the throughput of the master instance is greater than the
        acceptable threshold
        """
        return self.is_instance_throughput_too_low(self.instances.masterId)

    def is_instance_throughput_too_low(self, inst_id):
        """
        Return whether the throughput of the master instance is greater than the
        acceptable threshold
        """
        logging = self.instances.masterId == inst_id
        r = self.instance_throughput_ratio(inst_id)
        if r is None:
            if logging:
                logger.debug("{} instance {} throughput is not "
                             "measurable.".format(self, inst_id))
            return None
        too_low = r < self.Delta
        if logging and too_low:
            logger.display("{}{} instance {} throughput ratio {} is lower than Delta {}.".
                           format(MONITORING_PREFIX, self, inst_id, r, self.Delta))
        elif logging:
            logger.trace("{} instance {} throughput ratio {} is acceptable.".
                         format(self, inst_id, r))
        return too_low

    def isMasterReqLatencyTooHigh(self):
        """
        Return whether the request latency of the master instance is greater
        than the acceptable threshold
        """
        # TODO for now, view_change procedure can take more that 15 minutes
        # (5 minutes for catchup and 10 minutes for primary's answer).
        # Therefore, view_change triggering by max latency is not indicative now.

        r = self.masterReqLatencyTooHigh or \
            next(((key, lat) for key, lat in self.masterReqLatencies.items() if
                  lat > self.Lambda), None)
        if r:
            logger.display("{}{} found master's latency {} to be higher than the threshold for request {}.".
                           format(MONITORING_PREFIX, self, r[1], r[0]))
        else:
            logger.trace("{} found master's latency to be lower than the "
                         "threshold for all requests.".format(self))
        return r

    def isMasterAvgReqLatencyTooHigh(self):
        """
        Return whether the average request latency of the master instance is
        greater than the acceptable threshold
        """
        return self.is_instance_avg_req_latency_too_high(self.instances.masterId)

    def is_instance_avg_req_latency_too_high(self, inst_id):
        """
        Return whether the average request latency of an instance is
        greater than the acceptable threshold
        """
        avg_lat, avg_lat_others = self.getLatencies()
        if not avg_lat or not avg_lat_others:
            return False

        d = avg_lat - avg_lat_others
        if d < self.Omega:
            return False

        if inst_id == self.instances.masterId:
            logger.info("{}{} found difference between master's and "
                        "backups's avg latency {} to be higher than the "
                        "threshold".format(MONITORING_PREFIX, self, d))
            logger.trace(
                "{}'s master's avg request latency is {} and backup's "
                "avg request latency is {}".format(self, avg_lat, avg_lat_others))
        return True

    def getThroughputs(self, desired_inst_id: int):
        """
        Return a tuple of  the throughput of the given instance and the average
        throughput of the remaining instances.

        :param instId: the id of the protocol instance
        """

        instance_thrp = self.getThroughput(desired_inst_id)
        totalReqs, totalTm = self.getInstanceMetrics(forAllExcept=desired_inst_id)
        # Average backup replica's throughput
        if len(self.throughputs) > 1:
            thrs = []
            for inst_id, thr_obj in self.throughputs.items():
                if inst_id != desired_inst_id:
                    thr = self.getThroughput(inst_id)
                    if thr is not None:
                        thrs.append(thr)
            if thrs:
                other_thrp = self.throughput_avg_strategy_cls.get_avg(thrs)
            else:
                other_thrp = None
        else:
            other_thrp = None
        if instance_thrp == 0:
            if self.numOrderedRequests[desired_inst_id] == (0, 0):
                avgReqsPerInst = (totalReqs or 0) / self.instances.count
                if avgReqsPerInst <= 1:
                    # too early to tell if we need an instance change
                    instance_thrp = None
        return instance_thrp, other_thrp

    def getThroughput(self, instId: int) -> float:
        """
        Return the throughput of the specified instance.

        :param instId: the id of the protocol instance
        """
        # We are using the instanceStarted time in the denominator instead of
        # a time interval. This is alright for now as all the instances on a
        # node are started at almost the same time.
        if instId not in self.instances.ids:
            return None
        perf_time = time.perf_counter()
        throughput = self.throughputs[instId].get_throughput(perf_time)
        return throughput

    def getInstanceMetrics(
            self, forAllExcept: int) -> Tuple[Optional[int], Optional[float]]:
        """
        Calculate and return the average throughput of all the instances except
        the one specified as `forAllExcept`.
        """
        m = [(reqs, tm) for i, (reqs, tm)
             in self.numOrderedRequests.items()
             if i != forAllExcept]
        if m:
            reqs, tm = zip(*m)
            return sum(reqs), sum(tm)
        else:
            return None, None

    def getLatencies(self, desired_inst_id=None):
        if desired_inst_id is None:
            desired_inst_id = self.instances.masterId
        avg_lat = self.getLatency(desired_inst_id)
        avg_lat_others_by_inst = []
        for inst_id in self.instances.ids:
            if desired_inst_id == inst_id:
                continue
            lat = self.getLatency(inst_id)
            if lat:
                avg_lat_others_by_inst.append(lat)
        avg_lat_others = self.latency_avg_for_backup_cls.get_avg(avg_lat_others_by_inst)\
            if avg_lat_others_by_inst else None
        return avg_lat, avg_lat_others

    def getLatency(self, instId: int) -> float:
        """
        Return a dict with client identifier as a key and calculated latency as a value
        """
        if len(self.clientAvgReqLatencies) == 0:
            return 0.0
        return self.clientAvgReqLatencies[instId].get_avg_latency()

    def sendPeriodicStats(self):
        thoughputData = self.sendThroughput()
        self.clusterThroughputSpikeMonitorData['accum'].append(
            thoughputData['throughput'])
        self.sendLatencies()
        self.sendKnownNodesInfo()
        self.sendNodeInfo()
        self.sendSystemPerfomanceInfo()
        self.sendTotalRequests()

    def checkPerformance(self):
        self.sendClusterThroughputSpike()

    def sendClusterThroughputSpike(self):
        if self.instances.masterId is None:
            return None
        accum = 0
        for val in self.clusterThroughputSpikeMonitorData['accum']:
            accum += val
        if len(self.clusterThroughputSpikeMonitorData['accum']):
            accum /= len(self.clusterThroughputSpikeMonitorData['accum'])
        self.clusterThroughputSpikeMonitorData['accum'] = []
        return pluginManager.sendMessageUponSuspiciousSpike(
            notifierPluginTriggerEvents['clusterThroughputSpike'],
            self.clusterThroughputSpikeMonitorData,
            accum,
            self.notifierEventTriggeringConfig['clusterThroughputSpike'],
            self.name,
            self.notifierEventsEnabled
        )

    @property
    def highResThroughput(self):
        # TODO:KS Move these computations as well to plenum-stats project
        return self.getThroughput(self.instances.masterId)

    def sendThroughput(self):
        logger.debug("{} sending throughput".format(self))

        throughput = self.highResThroughput
        utcTime = datetime.utcnow()
        mtrStats = {
            "throughput": throughput,
            "timestamp": utcTime.isoformat(),
            "nodeName": self.name,
            # Multiply by 1000 for JavaScript date conversion
            "time": time.mktime(utcTime.timetuple()) * 1000
        }
        self._sendStatsDataIfRequired(
            EVENT_PERIODIC_STATS_THROUGHPUT, mtrStats)
        return mtrStats

    @property
    def masterLatency(self):
        master_latency, _ = self.getLatencies()
        return master_latency

    @property
    def avgBackupLatency(self):
        _, lat_backup = self.getLatencies()
        return lat_backup

    def sendLatencies(self):
        logger.debug("{} sending latencies".format(self))
        utcTime = datetime.utcnow()
        # Multiply by 1000 to make it compatible to JavaScript Date()
        jsTime = time.mktime(utcTime.timetuple()) * 1000

        latencies = dict(
            masterLatency=self.masterLatency,
            averageBackupLatency=self.avgBackupLatency,
            time=jsTime,
            nodeName=self.name,
            timestamp=utcTime.isoformat()
        )

        self._sendStatsDataIfRequired(
            EVENT_PERIODIC_STATS_LATENCIES, latencies)

    def sendKnownNodesInfo(self):
        logger.debug("{} sending nodestack".format(self))
        self._sendStatsDataIfRequired(
            EVENT_PERIODIC_STATS_NODES, remotesInfo(
                self.nodestack, self.blacklister))

    def sendSystemPerfomanceInfo(self):
        logger.debug("{} sending system performance".format(self))
        self._sendStatsDataIfRequired(
            EVENT_PERIODIC_STATS_SYSTEM_PERFORMANCE_INFO,
            self.captureSystemPerformance())

    def sendNodeInfo(self):
        logger.debug("{} sending node info".format(self))
        self._sendStatsDataIfRequired(
            EVENT_PERIODIC_STATS_NODE_INFO, self.nodeInfo['data'])

    def sendTotalRequests(self):
        logger.debug("{} sending total requests".format(self))

        totalRequests = dict(
            totalRequests=self.totalRequests
        )

        self._sendStatsDataIfRequired(
            EVENT_PERIODIC_STATS_TOTAL_REQUESTS, totalRequests)

    def captureSystemPerformance(self):
        logger.debug("{} capturing system performance".format(self))
        timestamp = time.time()
        cpu = psutil.cpu_percent(interval=None)
        ram = psutil.virtual_memory()
        curr_network = self.calculateTraffic()
        network = curr_network - self.lastKnownTraffic
        self.lastKnownTraffic = curr_network
        cpu_data = {
            'time': timestamp,
            'value': cpu
        }
        ram_data = {
            'time': timestamp,
            'value': ram.percent
        }
        traffic_data = {
            'time': timestamp,
            'value': network
        }
        return {
            'cpu': cpu_data,
            'ram': ram_data,
            'traffic': traffic_data
        }

    def postOnReqOrdered(self):
        utcTime = datetime.utcnow()
        # Multiply by 1000 to make it compatible to JavaScript Date()
        jsTime = time.mktime(utcTime.timetuple()) * 1000

        if self.totalViewChanges != self._lastPostedViewChange:
            self._lastPostedViewChange = self.totalViewChanges
            viewChange = dict(
                time=jsTime,
                viewChange=self._lastPostedViewChange
            )
            self._sendStatsDataIfRequired(EVENT_VIEW_CHANGE, viewChange)

        reqOrderedEventDict = dict(self.metrics())
        reqOrderedEventDict["created_at"] = utcTime.isoformat()
        reqOrderedEventDict["nodeName"] = self.name
        reqOrderedEventDict["time"] = jsTime
        reqOrderedEventDict["hasMasterPrimary"] = "Y" if self.hasMasterPrimary else "N"
        self._sendStatsDataIfRequired(EVENT_REQ_ORDERED, reqOrderedEventDict)
        self._clearSnapshot()

    def postOnNodeStarted(self, startedAt):
        throughputData = {
            "updateFrequency": self.config.DashboardUpdateFreq,
            "graphDuration": self.config.ThroughputGraphDuration
        }
        startedAtData = {"startedAt": startedAt, "ctx": "DEMO"}
        startedEventDict = {
            "startedAtData": startedAtData,
            "throughputConfig": throughputData
        }
        self._sendStatsDataIfRequired(EVENT_NODE_STARTED, startedEventDict)

    def _clearSnapshot(self):
        self.masterReqLatencyTooHigh = self.isMasterReqLatencyTooHigh()
        self.masterReqLatencies = {}

    def _sendStatsDataIfRequired(self, event, stats):
        if self.config.SendMonitorStats:
            for sc in self.statsConsumers:
                sc.sendStats(event, stats)

    @staticmethod
    def mean(data):
        return 0 if len(data) == 0 else mean(data)
Beispiel #5
0
    def __init__(self, name: str, Delta: float, Lambda: float, Omega: float,
                 instances: Instances, nodestack,
                 blacklister: Blacklister, nodeInfo: Dict,
                 notifierEventTriggeringConfig: Dict,
                 pluginPaths: Iterable[str] = None,
                 notifierEventsEnabled: bool = True):
        self.name = name
        self.instances = instances
        self.nodestack = nodestack
        self.blacklister = blacklister
        self.nodeInfo = nodeInfo
        self.notifierEventTriggeringConfig = notifierEventTriggeringConfig
        self.notifierEventsEnabled = notifierEventsEnabled

        self.Delta = Delta
        self.Lambda = Lambda
        self.Omega = Omega
        self.statsConsumers = self.getPluginsByType(pluginPaths,
                                                    PLUGIN_TYPE_STATS_CONSUMER)

        self.config = getConfig()

        # Number of ordered requests by each replica. The value at key `i` in
        # the dict is a tuple of the number of ordered requests by replica and
        # the time taken to order those requests by the replica of the `i`th
        # protocol instance
        self.numOrderedRequests = dict()  # type: Dict[int, Tuple[int, int]]

        # Dict(instance_id, throughput) of throughputs for replicas. Key is a instId and value is a instance of
        # ThroughputMeasurement class and provide throughputs evaluating mechanism
        self.throughputs = dict()   # type: Dict[int, ThroughputMeasurement]

        # Utility object for tracking requests order start and end
        # TODO: Has very similar cleanup logic to propagator.Requests
        self.requestTracker = RequestTimeTracker(instances.ids)

        # Request latencies for the master protocol instances. Key of the
        # dictionary is a tuple of client id and request id and the value is
        # the time the master instance took for ordering it
        self.masterReqLatencies = {}  # type: Dict[Tuple[str, int], float]

        # Indicates that request latency in previous snapshot of master req
        # latencies was too high
        self.masterReqLatencyTooHigh = False

        # Request latency(time taken to be ordered) for the client. The value
        # at key `i` in the dict is the LatencyMeasurement object which accumulate
        # average latency and total request for each client.
        self.clientAvgReqLatencies = dict()  # type: Dict[int, LatencyMeasurement]

        # TODO: Set this if this monitor belongs to a node which has primary
        # of master. Will be used to set `totalRequests`
        self.hasMasterPrimary = None

        # Total requests that have been ordered since the node started
        self.totalRequests = 0

        self.started = datetime.utcnow().isoformat()

        # attention: handlers will work over unordered request only once
        self.unordered_requests_handlers = []  # type: List[Callable]

        # Monitoring suspicious spikes in cluster throughput
        self.clusterThroughputSpikeMonitorData = {
            'value': 0,
            'cnt': 0,
            'accum': []
        }

        psutil.cpu_percent(interval=None)
        self.lastKnownTraffic = self.calculateTraffic()

        self.totalViewChanges = 0
        self._lastPostedViewChange = 0
        HasActionQueue.__init__(self)

        if self.config.SendMonitorStats:
            self.startRepeating(self.sendPeriodicStats,
                                self.config.DashboardUpdateFreq)

        self.startRepeating(
            self.checkPerformance,
            self.config.notifierEventTriggeringConfig['clusterThroughputSpike']['freq'])

        self.startRepeating(self.check_unordered, self.config.UnorderedCheckFreq)

        if 'disable_view_change' in self.config.unsafe:
            self.isMasterDegraded = lambda: False
        if 'disable_monitor' in self.config.unsafe:
            self.requestOrdered = lambda *args, **kwargs: {}
            self.sendPeriodicStats = lambda: None
            self.checkPerformance = lambda: None

        self.latency_avg_for_backup_cls = self.config.LatencyAveragingStrategyClass
        self.latency_measurement_cls = self.config.LatencyMeasurementCls
        self.throughput_avg_strategy_cls = self.config.throughput_averaging_strategy_class

        self.acc_monitor = None

        if self.config.ACC_MONITOR_ENABLED:
            self.acc_monitor = AccumulatingMonitorStrategy(
                start_time=time.perf_counter(),
                instances=instances.ids,
                txn_delta_k=self.config.ACC_MONITOR_TXN_DELTA_K,
                timeout=self.config.ACC_MONITOR_TIMEOUT,
                input_rate_reaction_half_time=self.config.ACC_MONITOR_INPUT_RATE_REACTION_HALF_TIME)
Beispiel #6
0
class Monitor(HasActionQueue, PluginLoaderHelper):
    """
    Implementation of RBFT's monitoring mechanism.

    The monitoring metrics are collected at the level of a node. Each node
    monitors the performance of each instance. Throughput of requests and
    latency per client request are measured.
    """

    def __init__(self, name: str, Delta: float, Lambda: float, Omega: float,
                 instances: Instances, nodestack,
                 blacklister: Blacklister, nodeInfo: Dict,
                 notifierEventTriggeringConfig: Dict,
                 pluginPaths: Iterable[str] = None,
                 notifierEventsEnabled: bool = True):
        self.name = name
        self.instances = instances
        self.nodestack = nodestack
        self.blacklister = blacklister
        self.nodeInfo = nodeInfo
        self.notifierEventTriggeringConfig = notifierEventTriggeringConfig
        self.notifierEventsEnabled = notifierEventsEnabled

        self.Delta = Delta
        self.Lambda = Lambda
        self.Omega = Omega
        self.statsConsumers = self.getPluginsByType(pluginPaths,
                                                    PLUGIN_TYPE_STATS_CONSUMER)

        self.config = getConfig()

        # Number of ordered requests by each replica. The value at key `i` in
        # the dict is a tuple of the number of ordered requests by replica and
        # the time taken to order those requests by the replica of the `i`th
        # protocol instance
        self.numOrderedRequests = dict()  # type: Dict[int, Tuple[int, int]]

        # Dict(instance_id, throughput) of throughputs for replicas. Key is a instId and value is a instance of
        # ThroughputMeasurement class and provide throughputs evaluating mechanism
        self.throughputs = dict()   # type: Dict[int, ThroughputMeasurement]

        # Utility object for tracking requests order start and end
        # TODO: Has very similar cleanup logic to propagator.Requests
        self.requestTracker = RequestTimeTracker(instances.ids)

        # Request latencies for the master protocol instances. Key of the
        # dictionary is a tuple of client id and request id and the value is
        # the time the master instance took for ordering it
        self.masterReqLatencies = {}  # type: Dict[Tuple[str, int], float]

        # Indicates that request latency in previous snapshot of master req
        # latencies was too high
        self.masterReqLatencyTooHigh = False

        # Request latency(time taken to be ordered) for the client. The value
        # at key `i` in the dict is the LatencyMeasurement object which accumulate
        # average latency and total request for each client.
        self.clientAvgReqLatencies = dict()  # type: Dict[int, LatencyMeasurement]

        # TODO: Set this if this monitor belongs to a node which has primary
        # of master. Will be used to set `totalRequests`
        self.hasMasterPrimary = None

        # Total requests that have been ordered since the node started
        self.totalRequests = 0

        self.started = datetime.utcnow().isoformat()

        # attention: handlers will work over unordered request only once
        self.unordered_requests_handlers = []  # type: List[Callable]

        # Monitoring suspicious spikes in cluster throughput
        self.clusterThroughputSpikeMonitorData = {
            'value': 0,
            'cnt': 0,
            'accum': []
        }

        psutil.cpu_percent(interval=None)
        self.lastKnownTraffic = self.calculateTraffic()

        self.totalViewChanges = 0
        self._lastPostedViewChange = 0
        HasActionQueue.__init__(self)

        if self.config.SendMonitorStats:
            self.startRepeating(self.sendPeriodicStats,
                                self.config.DashboardUpdateFreq)

        self.startRepeating(
            self.checkPerformance,
            self.config.notifierEventTriggeringConfig['clusterThroughputSpike']['freq'])

        self.startRepeating(self.check_unordered, self.config.UnorderedCheckFreq)

        if 'disable_view_change' in self.config.unsafe:
            self.isMasterDegraded = lambda: False
        if 'disable_monitor' in self.config.unsafe:
            self.requestOrdered = lambda *args, **kwargs: {}
            self.sendPeriodicStats = lambda: None
            self.checkPerformance = lambda: None

        self.latency_avg_for_backup_cls = self.config.LatencyAveragingStrategyClass
        self.latency_measurement_cls = self.config.LatencyMeasurementCls
        self.throughput_avg_strategy_cls = self.config.throughput_averaging_strategy_class

        self.acc_monitor = None

        if self.config.ACC_MONITOR_ENABLED:
            self.acc_monitor = AccumulatingMonitorStrategy(
                start_time=time.perf_counter(),
                instances=instances.ids,
                txn_delta_k=self.config.ACC_MONITOR_TXN_DELTA_K,
                timeout=self.config.ACC_MONITOR_TIMEOUT,
                input_rate_reaction_half_time=self.config.ACC_MONITOR_INPUT_RATE_REACTION_HALF_TIME)

    def __repr__(self):
        return self.name

    def metrics(self):
        """
        Calculate and return the metrics.
        """
        masterThrp, backupThrp = self.getThroughputs(self.instances.masterId)
        r = self.instance_throughput_ratio(self.instances.masterId)
        m = [
            ("{} Monitor metrics:".format(self), None),
            ("Delta", self.Delta),
            ("Lambda", self.Lambda),
            ("Omega", self.Omega),
            ("instances started", self.instances.started),
            ("ordered request counts",
             {i: r[0] for i, r in self.numOrderedRequests.items()}),
            ("ordered request durations",
             {i: r[1] for i, r in self.numOrderedRequests.items()}),
            ("master request latencies", self.masterReqLatencies),
            ("client avg request latencies", {i: self.getLatency(i)
                                              for i in self.instances.ids}),
            ("throughput", {i: self.getThroughput(i)
                            for i in self.instances.ids}),
            ("master throughput", masterThrp),
            ("total requests", self.totalRequests),
            ("avg backup throughput", backupThrp),
            ("master throughput ratio", r)]
        return m

    @property
    def prettymetrics(self) -> str:
        """
        Pretty printing for metrics
        """
        rendered = ["{}: {}".format(*m) for m in self.metrics()]
        return "\n            ".join(rendered)

    def calculateTraffic(self):
        currNetwork = psutil.net_io_counters()
        currNetwork = currNetwork.bytes_sent + currNetwork.bytes_recv
        currNetwork /= 1024
        return currNetwork

    @staticmethod
    def create_throughput_measurement(config, start_ts=None):
        if start_ts is None:
            start_ts = time.perf_counter()
        tm = config.throughput_measurement_class(
            **config.throughput_measurement_params)
        tm.init_time(start_ts)
        logger.trace("Creating throughput measurement class {} with parameters {} in start time {}"
                     .format(str(config.throughput_measurement_class), str(config.throughput_measurement_params), start_ts))
        return tm

    def reset(self):
        """
        Reset the monitor. Sets all monitored values to defaults.
        """
        logger.debug("{}'s Monitor being reset".format(self))
        instances_ids = self.instances.started.keys()
        self.numOrderedRequests = {inst_id: (0, 0) for inst_id in instances_ids}
        self.requestTracker.reset()
        self.masterReqLatencies = {}
        self.masterReqLatencyTooHigh = False
        self.totalViewChanges += 1
        self.lastKnownTraffic = self.calculateTraffic()
        if self.acc_monitor:
            self.acc_monitor.reset()
        for i in instances_ids:
            rm = self.create_throughput_measurement(self.config)
            self.throughputs[i] = rm
            lm = self.latency_measurement_cls(self.config)
            self.clientAvgReqLatencies[i] = lm

    def addInstance(self, inst_id):
        """
        Add one protocol instance for monitoring.
        """
        self.instances.add(inst_id)
        self.requestTracker.add_instance(inst_id)
        self.numOrderedRequests[inst_id] = (0, 0)
        rm = self.create_throughput_measurement(self.config)

        self.throughputs[inst_id] = rm
        lm = self.latency_measurement_cls(self.config)
        self.clientAvgReqLatencies[inst_id] = lm
        if self.acc_monitor:
            self.acc_monitor.add_instance(inst_id)

    def removeInstance(self, inst_id):
        if self.acc_monitor:
            self.acc_monitor.remove_instance(inst_id)
        if self.instances.count > 0:
            self.instances.remove(inst_id)
            self.requestTracker.remove_instance(inst_id)
            self.numOrderedRequests.pop(inst_id, None)
            self.clientAvgReqLatencies.pop(inst_id, None)
            self.throughputs.pop(inst_id, None)

    def requestOrdered(self, reqIdrs: List[str], instId: int,
                       requests, byMaster: bool = False) -> Dict:
        """
        Measure the time taken for ordering of a request and return it. Monitor
        might have been reset due to view change due to which this method
        returns None
        """
        now = time.perf_counter()
        if self.acc_monitor:
            self.acc_monitor.update_time(now)
        durations = {}
        for key in reqIdrs:
            if key not in self.requestTracker:
                logger.debug("Got untracked ordered request with digest {}".
                             format(key))
                continue
            if self.acc_monitor:
                self.acc_monitor.request_ordered(key, instId)
            if key in self.requestTracker.handled_unordered():
                started = self.requestTracker.started(key)
                logger.info('Consensus for ReqId: {} was achieved by {}:{} in {} seconds.'
                            .format(key, self.name, instId, now - started))
            duration = self.requestTracker.order(instId, key, now)
            self.throughputs[instId].add_request(now)

            if key in requests:
                identifier = requests[key].request.identifier
                self.clientAvgReqLatencies[instId].add_duration(identifier, duration)

            durations[key] = duration

        reqs, tm = self.numOrderedRequests[instId]
        orderedNow = len(durations)
        self.numOrderedRequests[instId] = (reqs + orderedNow,
                                           tm + sum(durations.values()))

        # TODO: Inefficient, as on every request a minimum of a large list is
        # calculated
        if min(r[0] for r in self.numOrderedRequests.values()) == (reqs + orderedNow):
            # If these requests is ordered by the last instance then increment
            # total requests, but why is this important, why cant is ordering
            # by master not enough?
            self.totalRequests += orderedNow
            self.postOnReqOrdered()
            if 0 == reqs:
                self.postOnNodeStarted(self.started)

        return durations

    def requestUnOrdered(self, key: str):
        """
        Record the time at which request ordering started.
        """
        now = time.perf_counter()
        if self.acc_monitor:
            self.acc_monitor.update_time(now)
            self.acc_monitor.request_received(key)
        self.requestTracker.start(key, now)

    def check_unordered(self):
        now = time.perf_counter()
        new_unordereds = [(req, now - started) for req, started in self.requestTracker.unhandled_unordered()
                          if now - started > self.config.UnorderedCheckFreq]
        if len(new_unordereds) == 0:
            return
        for handler in self.unordered_requests_handlers:
            handler(new_unordereds)
        for unordered in new_unordereds:
            self.requestTracker.handle(unordered[0])
            logger.debug('Following requests were not ordered for more than {} seconds: {}'
                         .format(self.config.UnorderedCheckFreq, unordered[0]))

    def isMasterDegraded(self):
        """
        Return whether the master instance is slow.
        """
        if self.acc_monitor:
            self.acc_monitor.update_time(time.perf_counter())
            return self.acc_monitor.is_master_degraded()
        else:
            return (self.instances.masterId is not None and
                    (self.isMasterThroughputTooLow() or
                     # TODO for now, view_change procedure can take more that 15 minutes
                     # (5 minutes for catchup and 10 minutes for primary's answer).
                     # Therefore, view_change triggering by max latency now is not indicative.
                     # self.isMasterReqLatencyTooHigh() or
                     self.isMasterAvgReqLatencyTooHigh()))

    def areBackupsDegraded(self):
        """
        Return slow instance.
        """
        slow_instances = []
        if self.acc_monitor:
            for instance in self.instances.backupIds:
                if self.acc_monitor.is_instance_degraded(instance):
                    slow_instances.append(instance)
        else:
            for instance in self.instances.backupIds:
                if self.is_instance_throughput_too_low(instance):
                    slow_instances.append(instance)
        return slow_instances

    def instance_throughput_ratio(self, inst_id):
        """
        The relative throughput of an instance compared to the backup
        instances.
        """
        inst_thrp, otherThrp = self.getThroughputs(inst_id)

        # Backup throughput may be 0 so moving ahead only if it is not 0
        r = inst_thrp / otherThrp if otherThrp and inst_thrp is not None \
            else None
        return r

    def isMasterThroughputTooLow(self):
        """
        Return whether the throughput of the master instance is greater than the
        acceptable threshold
        """
        return self.is_instance_throughput_too_low(self.instances.masterId)

    def is_instance_throughput_too_low(self, inst_id):
        """
        Return whether the throughput of the master instance is greater than the
        acceptable threshold
        """
        r = self.instance_throughput_ratio(inst_id)
        if r is None:
            logger.debug("{} instance {} throughput is not "
                         "measurable.".format(self, inst_id))
            return None
        too_low = r < self.Delta
        if too_low:
            logger.display("{}{} instance {} throughput ratio {} is lower than Delta {}.".
                           format(MONITORING_PREFIX, self, inst_id, r, self.Delta))
        else:
            logger.trace("{} instance {} throughput ratio {} is acceptable.".
                         format(self, inst_id, r))
        return too_low

    def isMasterReqLatencyTooHigh(self):
        """
        Return whether the request latency of the master instance is greater
        than the acceptable threshold
        """
        # TODO for now, view_change procedure can take more that 15 minutes
        # (5 minutes for catchup and 10 minutes for primary's answer).
        # Therefore, view_change triggering by max latency is not indicative now.

        r = self.masterReqLatencyTooHigh or \
            next(((key, lat) for key, lat in self.masterReqLatencies.items() if
                  lat > self.Lambda), None)
        if r:
            logger.display("{}{} found master's latency {} to be higher than the threshold for request {}.".
                           format(MONITORING_PREFIX, self, r[1], r[0]))
        else:
            logger.trace("{} found master's latency to be lower than the "
                         "threshold for all requests.".format(self))
        return r

    def isMasterAvgReqLatencyTooHigh(self):
        """
        Return whether the average request latency of the master instance is
        greater than the acceptable threshold
        """
        return self.is_instance_avg_req_latency_too_high(self.instances.masterId)

    def is_instance_avg_req_latency_too_high(self, inst_id):
        """
        Return whether the average request latency of an instance is
        greater than the acceptable threshold
        """
        avg_lat, avg_lat_others = self.getLatencies()
        if not avg_lat or not avg_lat_others:
            return False

        d = avg_lat - avg_lat_others
        if d < self.Omega:
            return False

        if inst_id == self.instances.masterId:
            logger.info("{}{} found difference between master's and "
                        "backups's avg latency {} to be higher than the "
                        "threshold".format(MONITORING_PREFIX, self, d))
            logger.trace(
                "{}'s master's avg request latency is {} and backup's "
                "avg request latency is {}".format(self, avg_lat, avg_lat_others))
        return True

    def getThroughputs(self, desired_inst_id: int):
        """
        Return a tuple of  the throughput of the given instance and the average
        throughput of the remaining instances.

        :param instId: the id of the protocol instance
        """

        instance_thrp = self.getThroughput(desired_inst_id)
        totalReqs, totalTm = self.getInstanceMetrics(forAllExcept=desired_inst_id)
        # Average backup replica's throughput
        if len(self.throughputs) > 1:
            thrs = []
            for inst_id, thr_obj in self.throughputs.items():
                if inst_id == desired_inst_id:
                    continue
                thr = self.getThroughput(inst_id)
                if thr is not None:
                    thrs.append(thr)
            if thrs:
                other_thrp = self.throughput_avg_strategy_cls.get_avg(thrs)
            else:
                other_thrp = None
        else:
            other_thrp = None
        if instance_thrp == 0:
            if self.numOrderedRequests[desired_inst_id] == (0, 0):
                avgReqsPerInst = (totalReqs or 0) / self.instances.count
                if avgReqsPerInst <= 1:
                    # too early to tell if we need an instance change
                    instance_thrp = None
        return instance_thrp, other_thrp

    def getThroughput(self, instId: int) -> float:
        """
        Return the throughput of the specified instance.

        :param instId: the id of the protocol instance
        """
        # We are using the instanceStarted time in the denominator instead of
        # a time interval. This is alright for now as all the instances on a
        # node are started at almost the same time.
        if instId not in self.instances.ids:
            return None
        perf_time = time.perf_counter()
        throughput = self.throughputs[instId].get_throughput(perf_time)
        return throughput

    def getInstanceMetrics(
            self, forAllExcept: int) -> Tuple[Optional[int], Optional[float]]:
        """
        Calculate and return the average throughput of all the instances except
        the one specified as `forAllExcept`.
        """
        m = [(reqs, tm) for i, (reqs, tm)
             in self.numOrderedRequests.items()
             if i != forAllExcept]
        if m:
            reqs, tm = zip(*m)
            return sum(reqs), sum(tm)
        else:
            return None, None

    def getLatencies(self, desired_inst_id=None):
        if desired_inst_id is None:
            desired_inst_id = self.instances.masterId
        avg_lat = self.getLatency(desired_inst_id)
        avg_lat_others_by_inst = []
        for inst_id in self.instances.ids:
            if desired_inst_id == inst_id:
                continue
            lat = self.getLatency(inst_id)
            if lat:
                avg_lat_others_by_inst.append(lat)
        avg_lat_others = self.latency_avg_for_backup_cls.get_avg(avg_lat_others_by_inst)\
            if avg_lat_others_by_inst else None
        return avg_lat, avg_lat_others

    def getLatency(self, instId: int) -> float:
        """
        Return a dict with client identifier as a key and calculated latency as a value
        """
        if len(self.clientAvgReqLatencies) == 0:
            return 0.0
        return self.clientAvgReqLatencies[instId].get_avg_latency()

    def sendPeriodicStats(self):
        thoughputData = self.sendThroughput()
        self.clusterThroughputSpikeMonitorData['accum'].append(
            thoughputData['throughput'])
        self.sendLatencies()
        self.sendKnownNodesInfo()
        self.sendNodeInfo()
        self.sendSystemPerfomanceInfo()
        self.sendTotalRequests()

    def checkPerformance(self):
        self.sendClusterThroughputSpike()

    def sendClusterThroughputSpike(self):
        if self.instances.masterId is None:
            return None
        accum = 0
        for val in self.clusterThroughputSpikeMonitorData['accum']:
            accum += val
        if len(self.clusterThroughputSpikeMonitorData['accum']):
            accum /= len(self.clusterThroughputSpikeMonitorData['accum'])
        self.clusterThroughputSpikeMonitorData['accum'] = []
        return pluginManager.sendMessageUponSuspiciousSpike(
            notifierPluginTriggerEvents['clusterThroughputSpike'],
            self.clusterThroughputSpikeMonitorData,
            accum,
            self.notifierEventTriggeringConfig['clusterThroughputSpike'],
            self.name,
            self.notifierEventsEnabled
        )

    @property
    def highResThroughput(self):
        # TODO:KS Move these computations as well to plenum-stats project
        return self.getThroughput(self.instances.masterId)

    def sendThroughput(self):
        logger.debug("{} sending throughput".format(self))

        throughput = self.highResThroughput
        utcTime = datetime.utcnow()
        mtrStats = {
            "throughput": throughput,
            "timestamp": utcTime.isoformat(),
            "nodeName": self.name,
            # Multiply by 1000 for JavaScript date conversion
            "time": time.mktime(utcTime.timetuple()) * 1000
        }
        self._sendStatsDataIfRequired(
            EVENT_PERIODIC_STATS_THROUGHPUT, mtrStats)
        return mtrStats

    @property
    def masterLatency(self):
        master_latency, _ = self.getLatencies()
        return master_latency

    @property
    def avgBackupLatency(self):
        _, lat_backup = self.getLatencies()
        return lat_backup

    def sendLatencies(self):
        logger.debug("{} sending latencies".format(self))
        utcTime = datetime.utcnow()
        # Multiply by 1000 to make it compatible to JavaScript Date()
        jsTime = time.mktime(utcTime.timetuple()) * 1000

        latencies = dict(
            masterLatency=self.masterLatency,
            averageBackupLatency=self.avgBackupLatency,
            time=jsTime,
            nodeName=self.name,
            timestamp=utcTime.isoformat()
        )

        self._sendStatsDataIfRequired(
            EVENT_PERIODIC_STATS_LATENCIES, latencies)

    def sendKnownNodesInfo(self):
        logger.debug("{} sending nodestack".format(self))
        self._sendStatsDataIfRequired(
            EVENT_PERIODIC_STATS_NODES, remotesInfo(
                self.nodestack, self.blacklister))

    def sendSystemPerfomanceInfo(self):
        logger.debug("{} sending system performance".format(self))
        self._sendStatsDataIfRequired(
            EVENT_PERIODIC_STATS_SYSTEM_PERFORMANCE_INFO,
            self.captureSystemPerformance())

    def sendNodeInfo(self):
        logger.debug("{} sending node info".format(self))
        self._sendStatsDataIfRequired(
            EVENT_PERIODIC_STATS_NODE_INFO, self.nodeInfo['data'])

    def sendTotalRequests(self):
        logger.debug("{} sending total requests".format(self))

        totalRequests = dict(
            totalRequests=self.totalRequests
        )

        self._sendStatsDataIfRequired(
            EVENT_PERIODIC_STATS_TOTAL_REQUESTS, totalRequests)

    def captureSystemPerformance(self):
        logger.debug("{} capturing system performance".format(self))
        timestamp = time.time()
        cpu = psutil.cpu_percent(interval=None)
        ram = psutil.virtual_memory()
        curr_network = self.calculateTraffic()
        network = curr_network - self.lastKnownTraffic
        self.lastKnownTraffic = curr_network
        cpu_data = {
            'time': timestamp,
            'value': cpu
        }
        ram_data = {
            'time': timestamp,
            'value': ram.percent
        }
        traffic_data = {
            'time': timestamp,
            'value': network
        }
        return {
            'cpu': cpu_data,
            'ram': ram_data,
            'traffic': traffic_data
        }

    def postOnReqOrdered(self):
        utcTime = datetime.utcnow()
        # Multiply by 1000 to make it compatible to JavaScript Date()
        jsTime = time.mktime(utcTime.timetuple()) * 1000

        if self.totalViewChanges != self._lastPostedViewChange:
            self._lastPostedViewChange = self.totalViewChanges
            viewChange = dict(
                time=jsTime,
                viewChange=self._lastPostedViewChange
            )
            self._sendStatsDataIfRequired(EVENT_VIEW_CHANGE, viewChange)

        reqOrderedEventDict = dict(self.metrics())
        reqOrderedEventDict["created_at"] = utcTime.isoformat()
        reqOrderedEventDict["nodeName"] = self.name
        reqOrderedEventDict["time"] = jsTime
        reqOrderedEventDict["hasMasterPrimary"] = "Y" if self.hasMasterPrimary else "N"
        self._sendStatsDataIfRequired(EVENT_REQ_ORDERED, reqOrderedEventDict)
        self._clearSnapshot()

    def postOnNodeStarted(self, startedAt):
        throughputData = {
            "updateFrequency": self.config.DashboardUpdateFreq,
            "graphDuration": self.config.ThroughputGraphDuration
        }
        startedAtData = {"startedAt": startedAt, "ctx": "DEMO"}
        startedEventDict = {
            "startedAtData": startedAtData,
            "throughputConfig": throughputData
        }
        self._sendStatsDataIfRequired(EVENT_NODE_STARTED, startedEventDict)

    def _clearSnapshot(self):
        self.masterReqLatencyTooHigh = self.isMasterReqLatencyTooHigh()
        self.masterReqLatencies = {}

    def _sendStatsDataIfRequired(self, event, stats):
        if self.config.SendMonitorStats:
            for sc in self.statsConsumers:
                sc.sendStats(event, stats)

    @staticmethod
    def mean(data):
        return 0 if len(data) == 0 else mean(data)
def test_acc_monitor_works_even_with_one_instance(monitor: AccumulatingMonitorStrategy):
    monitor.remove_instance(1)

    start_time = 100.0
    monitor.update_time(start_time)
    monitor.request_received('A')
    monitor.update_time(start_time + 1.2 * ACC_MONITOR_TIMEOUT)
    assert not monitor.is_master_degraded()

    monitor.request_ordered('A', 0)
    monitor.update_time(start_time + 2.4 * ACC_MONITOR_TIMEOUT)
    assert not monitor.is_master_degraded()
def test_acc_monitor_behaves_as_expected(monitor: AccumulatingMonitorStrategy):
    # Simulate txns receiving and ordering
    triggered_time = simulate_ordering(monitor, 100.0, 1)

    # Monitor should not be triggered before timeout
    monitor.update_time(triggered_time + 0.8 * ACC_MONITOR_TIMEOUT)
    assert not monitor.is_master_degraded()
    assert not monitor.is_instance_degraded(1)

    # Monitor should be triggered on master after timeout
    monitor.update_time(triggered_time + 1.2 * ACC_MONITOR_TIMEOUT)
    assert monitor.is_master_degraded()
    assert not monitor.is_instance_degraded(1)

    # Monitor should not be triggered after reset
    monitor.reset()
    assert not monitor.is_master_degraded()
    assert not monitor.is_instance_degraded(1)

    # Give monitor chance to observe txn statistics
    reset_time = triggered_time + 2.4 * ACC_MONITOR_TIMEOUT
    monitor.update_time(reset_time)

    # Monitor should not be triggered if nothing is ordered
    monitor.update_time(reset_time + 1.2 * ACC_MONITOR_TIMEOUT)
    assert not monitor.is_master_degraded()
    assert not monitor.is_instance_degraded(1)
def test_acc_monitor_isnt_alerted_when_created(monitor: AccumulatingMonitorStrategy):
    assert not monitor.is_master_degraded()
    assert not monitor.is_instance_degraded(1)
def test_acc_monitor_handles_adding_instances(monitor: AccumulatingMonitorStrategy):
    monitor.add_instance(2)

    triggered_time = simulate_ordering(monitor, 100.0, 2)

    # Monitor should not be triggered before timeout
    monitor.update_time(triggered_time + 0.8 * ACC_MONITOR_TIMEOUT)
    assert not monitor.is_master_degraded()
    assert not monitor.is_instance_degraded(1)
    assert not monitor.is_instance_degraded(2)

    # Monitor should be triggered after timeout
    monitor.update_time(triggered_time + 1.2 * ACC_MONITOR_TIMEOUT)
    assert monitor.is_master_degraded()
    assert monitor.is_instance_degraded(1)  # TODO: Monitor behaves like this now, but is it okay?
    assert not monitor.is_instance_degraded(2)