def __init__(self, ip, port=9093, interval=10, stop_flag: threading.Event = None):
     """Initialize the Alert Manager listener thread.

     NOTE(review): this ``__init__`` appears outside any class in this chunk;
     presumably it belongs to ``PrometheusAlertManagerListener`` — confirm.

     :param ip: Alert Manager host address, used to build the API v2 base URL.
     :param port: Alert Manager HTTP port (default 9093).
     :param interval: polling interval in seconds.
     :param stop_flag: optional externally-owned stop event so shutdown can be
         coordinated by the caller; a private one is created when not supplied.
     """
     # Thread name mirrors the class name; daemon=True so it never blocks exit.
     super().__init__(name=self.__class__.__name__, daemon=True)
     self._alert_manager_url = f"http://{ip}:{port}/api/v2"
     self._stop_flag = stop_flag if stop_flag else threading.Event()
     self._interval = interval
     self._timeout = 600  # seconds to wait for Alert Manager to come up
     self.event_registry = ContinuousEventsRegistry()
Example #2
0
    def test_get_compact_events_by_continues_hash_from_log(
            self, populated_registry: ContinuousEventsRegistry):
        """Begin events replayed from a log are findable by continuous hash,
        and replaying the matching end log removes them from the registry."""
        self._read_events_from_file("test_data/compaction_event_start.log")

        event_attrs = {
            'node': 'node1',
            'shard': '2',
            'table': 'system.local',
            'compaction_process_id': 'edc49670-2a65-11ec-a8b8-b62621e7624c',
        }
        continues_hash = CompactionEvent.get_continuous_hash_from_dict(event_attrs)
        found_events = populated_registry.find_continuous_events_by_hash(continues_hash)

        # Consuming the end-of-compaction log lines should close the events.
        self._read_events_from_file("test_data/compaction_event_end.log")

        assert not populated_registry.find_continuous_events_by_hash(continues_hash), \
            "Event was not removed from registry"
        assert found_events
        latest_event = found_events[-1]
        assert latest_event
        assert isinstance(latest_event, CompactionEvent)
        assert latest_event.node == 'node1'
        assert latest_event.shard == 2
        assert latest_event.table == 'system.local'
        assert latest_event.compaction_process_id == 'edc49670-2a65-11ec-a8b8-b62621e7624c'
    def test_get_events_by_period_type(
            self, populated_registry: ContinuousEventsRegistry,
            nodetool_event):
        """Beginning one more event grows the BEGIN-period result set by one."""
        begun_count_before = len(populated_registry.get_events_by_period(
            period_type=EventPeriod.BEGIN))

        nodetool_event.begin_event()

        begun_events_now = populated_registry.get_events_by_period(
            period_type=EventPeriod.BEGIN)
        assert len(begun_events_now) == begun_count_before + 1
def get_pattern_to_event_to_func_mapping(node: str) \
        -> List[ScyllaServerEventPatternFuncs]:
    """
    This function maps regex patterns, event classes and begin / end
    functions into ScyllaServerEventPatternFuncs object. Helper
    functions are delegated to find the event that should be the
    target of the start / stop action, or creating a new one.

    NOTE(review): in this chunk the function body is truncated — ``mapping``
    is never populated and there is no ``return`` statement; confirm against
    the full source.
    """
    mapping = []
    # One registry instance is shared (via closure) by both helpers below.
    event_registry = ContinuousEventsRegistry()

    def _add_event(event_type: Type[ScyllaDatabaseContinuousEvent],
                   match: Match):
        # Build and begin a new continuous event from the regex named groups.
        kwargs = match.groupdict()
        if "shard" in kwargs:
            # Regex groups are strings; the event expects a numeric shard.
            kwargs["shard"] = int(kwargs["shard"])
        event_type(node=node, **kwargs).begin_event()

    def _end_event(event_type: Type[ScyllaDatabaseContinuousEvent],
                   match: Match):
        # Locate the begun event by its continuous hash and end it; when no
        # begun counterpart exists, publish a DEBUG framework event instead.
        kwargs = match.groupdict()
        continuous_hash = event_type.get_continuous_hash_from_dict({
            'node': node,
            **kwargs
        })
        if begin_event := event_registry.find_continuous_events_by_hash(
                continuous_hash):
            # Most recent begun event wins.
            begin_event[-1].end_event()
            return
        TestFrameworkEvent(
            source=event_type.__name__,
            message=
            f"Did not find events of type {event_type} with hash {continuous_hash} ({kwargs})"
            f" with period type {EventPeriod.BEGIN.value}",
            severity=Severity.DEBUG).publish_or_dump()
    def test_get_event_by_id(self,
                             populated_registry: ContinuousEventsRegistry):
        """Looking up a random registered event by id returns that event."""
        picked_event = random.choice(populated_registry.continuous_events)
        looked_up_event = populated_registry.get_event_by_id(picked_event.event_id)

        assert looked_up_event.event_id == picked_event.event_id
    def test_get_compact_events_by_attr_from_log(
            self, populated_registry: ContinuousEventsRegistry):
        """Replay a compaction log through the pattern/func mapping, then
        filter the registry by attributes and verify the two ended events.
        """
        # The mapping does not depend on the log line — build it once instead
        # of rebuilding it for every line (the original did it per line).
        db_event_pattern_func_map = get_pattern_to_event_to_func_mapping(
            node='node1')
        with Path(__file__).parent.joinpath(
                "test_data/compaction_event.log").open(
                    encoding="utf-8") as sct_log:
            for line in sct_log:
                for item in db_event_pattern_func_map:
                    event_match = item.pattern.search(line)
                    if event_match:
                        try:
                            item.period_func(match=event_match)
                        except RuntimeError as rex:
                            # Ignore the fact that the event is not published;
                            # it still will be created. Any OTHER RuntimeError
                            # is unexpected and must not be swallowed (the
                            # original silently dropped it).
                            if 'You should create default EventsProcessRegistry first' not in str(rex):
                                raise

        registry_filter = populated_registry.get_registry_filter()
        found_events = registry_filter.filter_by_attr(
            base="CompactionEvent",
            severity=Severity.NORMAL,
            period_type=EventPeriod.END.value,
            table='system.local').get_filtered()

        assert len(found_events) == 2, f"Found events: {found_events}"
        assert sorted(event.compaction_process_id for event in found_events) == \
            ['7c58a350-2a65-11ec-b5b3-d14f790022cc', 'edc49670-2a65-11ec-a8b8-b62621e7624c']
    def test_get_events_by_type(self,
                                populated_registry: ContinuousEventsRegistry):
        """get_events_by_type returns exactly the GeminiStressEvent instances."""
        expected_count = sum(
            isinstance(event, GeminiStressEvent)
            for event in populated_registry.continuous_events)

        gemini_events = populated_registry.get_events_by_type(
            event_type=GeminiStressEvent)

        assert len(gemini_events) == expected_count
        assert all(isinstance(event, GeminiStressEvent)
                   for event in gemini_events)
    def test_get_events_by_attr(self,
                                populated_registry: ContinuousEventsRegistry,
                                nodetool_event):
        """filter_by_attr narrows the registry down to the one begun event."""
        nodetool_event.nodetool_command = 'test_get_events_by_attr'
        nodetool_event.event_id = 'dc4c854c-6bb5-4689-9af6-a9aae225611a'
        nodetool_event.begin_event()

        matching_events = populated_registry.get_registry_filter().filter_by_attr(
            base="NodetoolEvent",
            severity=Severity.NORMAL,
            period_type=EventPeriod.BEGIN.value,
            nodetool_command='test_get_events_by_attr').get_filtered()

        assert len(matching_events) == 1
        assert matching_events[0] == nodetool_event
def get_pattern_to_event_to_func_mapping(node: str) \
        -> List[ScyllaServerEventPatternFuncs]:
    """
    This function maps regex patterns, event classes and begin / end
    functions into ScyllaServerEventPatternFuncs object. Helper
    functions are delegated to find the event that should be the
    target of the start / stop action, or creating a new one.
    """
    # One registry instance is shared (via closure) by both helpers below.
    event_registry = ContinuousEventsRegistry()

    def _add_event(event_type: Type[ScyllaDatabaseContinuousEvent],
                   match: Match):
        # Begin a fresh continuous event built from the regex named groups.
        kwargs = match.groupdict()
        if "shard" in kwargs:
            # Regex groups are strings; the event expects a numeric shard.
            kwargs["shard"] = int(kwargs["shard"])
        event_type(node=node, **kwargs).begin_event()

    def _end_event(event_type: Type[ScyllaDatabaseContinuousEvent],
                   match: Match):
        # Find the most recent begun event matching the captured attributes
        # and end it; otherwise report the miss as a framework event.
        kwargs = match.groupdict()

        event_filter = (event_registry.get_registry_filter()
                        .filter_by_node(node=node)
                        .filter_by_type(event_type=event_type)
                        .filter_by_period(period_type=EventPeriod.BEGIN.value))

        shard = kwargs.get("shard")
        if shard:
            event_filter.filter_by_shard(int(shard))

        table = kwargs.get("table")
        if table:
            event_filter.filter_by_attr(base="CompactionEvent", table=table)

        process_id = kwargs.get("compaction_process_id")
        if process_id:
            event_filter.filter_by_attr(base="CompactionEvent",
                                        compaction_process_id=process_id)

        begun_events = event_filter.get_filtered()

        if not begun_events:
            TestFrameworkEvent(
                source=event_type.__name__,
                message="Did not find any events of type {event_type}"
                " with period type {period_type}, event data: {event_data}".format(
                    event_type=event_type,
                    period_type=EventPeriod.BEGIN.value,
                    event_data=kwargs,
                ),
                severity=Severity.ERROR).publish_or_dump()
            return

        if len(begun_events) > 1:
            LOGGER.debug(
                "Found %s events of type %s with period %s. "
                "Will apply the function to most recent event by default.",
                len(begun_events), event_type, EventPeriod.BEGIN.value)
        # Most recent begun event wins.
        begun_events[-1].end_event()

    mapping = []
    for event in SCYLLA_DATABASE_CONTINUOUS_EVENTS:
        mapping.extend((
            ScyllaServerEventPatternFuncs(
                pattern=re.compile(event.begin_pattern),
                event_class=event,
                period_func=partial(_add_event, event_type=event)),
            ScyllaServerEventPatternFuncs(
                pattern=re.compile(event.end_pattern),
                event_class=event,
                period_func=partial(_end_event, event_type=event)),
        ))

    return mapping
Example #10
0
class PrometheusAlertManagerListener(threading.Thread):
    """Daemon thread that talks to a Prometheus Alert Manager instance over
    its API v2 and republishes alert state changes as SCT events."""

    def __init__(self,
                 ip,
                 port=9093,
                 interval=10,
                 stop_flag: threading.Event = None):
        """
        :param ip: Alert Manager host address, used to build the API v2 URL.
        :param port: Alert Manager HTTP port (default 9093).
        :param interval: polling interval in seconds.
        :param stop_flag: optional externally-owned stop event; a private one
            is created when not supplied.
        """
        # Thread name mirrors the class name; daemon so it never blocks exit.
        super().__init__(name=self.__class__.__name__, daemon=True)
        self._alert_manager_url = f"http://{ip}:{port}/api/v2"
        self._stop_flag = stop_flag if stop_flag else threading.Event()
        self._interval = interval
        self._timeout = 600  # seconds to wait for Alert Manager to come up
        self.event_registry = ContinuousEventsRegistry()

    @property
    def is_alert_manager_up(self):
        """True when the /status endpoint reports cluster status 'ready'."""
        try:
            return requests.get(
                f"{self._alert_manager_url}/status",
                timeout=3).json()['cluster']['status'] == 'ready'
        except Exception:  # pylint: disable=broad-except
            # Any network / JSON / key error simply means "not up yet".
            return False

    @log_run_info
    def wait_till_alert_manager_up(self):
        """Block until Alert Manager is ready, the stop flag is set, or
        ``self._timeout`` seconds elapse (then raise ``TimeoutError``)."""
        end_time = time.time() + self._timeout
        while time.time() < end_time and not self._stop_flag.is_set():
            if self.is_alert_manager_up:
                return
            time.sleep(30)
        if self._stop_flag.is_set():
            LOGGER.warning("Prometheus Alert Manager was asked to stop.")
        else:
            raise TimeoutError(
                f"Prometheus Alert Manager({self._alert_manager_url}) "
                f"did not get up for {self._timeout}s")

    @log_run_info
    def stop(self):
        """Request the listener to terminate."""
        self._stop_flag.set()

    @retrying(n=10)
    def _get_alerts(self, active=False):
        """Fetch alerts from Alert Manager.

        :param active: when True, request only currently-active alerts.
        :return: parsed JSON alert list, or None on a non-200 response.
        """
        if active:
            response = requests.get(
                f"{self._alert_manager_url}/alerts?active={int(active)}",
                timeout=3)
        else:
            response = requests.get(f"{self._alert_manager_url}/alerts",
                                    timeout=3)
        if response.status_code == 200:
            return response.json()
        return None

    def _publish_new_alerts(self, alerts: dict):  # pylint: disable=no-self-use
        """Publish a begin event for every newly-seen alert."""
        for alert in alerts.values():
            PrometheusAlertManagerEvent(raw_alert=alert).begin_event()

    def _publish_end_of_alerts(self, alerts: dict):
        """Publish end events for alerts that left the active set.

        Each finished alert is matched to its begun event by continuous hash
        (node / startsAt / alertname); when no begun counterpart exists an
        informational stand-alone end event is published instead.
        """
        all_alerts = self._get_alerts()
        updated_dict = {}
        if all_alerts:
            # Index the freshest alert copies by fingerprint for lookup below.
            for alert in all_alerts:
                fingerprint = alert.get('fingerprint', None)
                if not fingerprint:
                    continue
                updated_dict[fingerprint] = alert
        for alert in alerts.values():
            if not alert.get('endsAt', None):
                # Alert Manager gave no end time — stamp "now" (UTC).
                alert['endsAt'] = time.strftime("%Y-%m-%dT%H:%M:%S.0Z",
                                                time.gmtime())
            # Prefer the freshest copy of the alert when available.
            alert = updated_dict.get(alert['fingerprint'], alert)
            labels = alert.get("labels") or {}
            alert_name = labels.get("alertname", "")
            node = labels.get("instance", "N/A")

            continuous_hash = PrometheusAlertManagerEvent.get_continuous_hash_from_dict(
                {
                    'node': node,
                    'starts_at': alert.get("startsAt"),
                    'alert_name': alert_name
                })

            if begin_event := self.event_registry.find_continuous_events_by_hash(
                    continuous_hash):
                # Most recent begun event wins.
                begin_event[-1].end_event()
                continue

            # No begun counterpart — publish a stand-alone informational end.
            new_event = PrometheusAlertManagerEvent(raw_alert=alert)
            new_event.period_type = EventPeriod.INFORMATIONAL.value
            new_event.end_event()
class PrometheusAlertManagerListener(threading.Thread):
    """Daemon thread that polls a Prometheus Alert Manager over its API v2
    and republishes alert transitions as SCT continuous events (begin events
    for new alerts, end events for resolved ones)."""

    def __init__(self,
                 ip,
                 port=9093,
                 interval=10,
                 stop_flag: threading.Event = None):
        """
        :param ip: Alert Manager host address, used to build the API v2 URL.
        :param port: Alert Manager HTTP port (default 9093).
        :param interval: polling interval in seconds.
        :param stop_flag: optional externally-owned stop event; a private one
            is created when not supplied.
        """
        # Thread name mirrors the class name; daemon so it never blocks exit.
        super().__init__(name=self.__class__.__name__, daemon=True)
        self._alert_manager_url = f"http://{ip}:{port}/api/v2"
        self._stop_flag = stop_flag if stop_flag else threading.Event()
        self._interval = interval
        self._timeout = 600  # seconds to wait for Alert Manager to come up
        self.event_registry = ContinuousEventsRegistry()

    @property
    def is_alert_manager_up(self):
        """True when the /status endpoint reports cluster status 'ready'."""
        try:
            return requests.get(
                f"{self._alert_manager_url}/status",
                timeout=3).json()['cluster']['status'] == 'ready'
        except Exception:  # pylint: disable=broad-except
            # Any network / JSON / key error simply means "not up yet".
            return False

    @log_run_info
    def wait_till_alert_manager_up(self):
        """Block until Alert Manager is ready, the stop flag is set, or
        ``self._timeout`` seconds elapse (then raise ``TimeoutError``)."""
        end_time = time.time() + self._timeout
        while time.time() < end_time and not self._stop_flag.is_set():
            if self.is_alert_manager_up:
                return
            time.sleep(30)
        if self._stop_flag.is_set():
            LOGGER.warning("Prometheus Alert Manager was asked to stop.")
        else:
            raise TimeoutError(
                f"Prometheus Alert Manager({self._alert_manager_url}) "
                f"did not get up for {self._timeout}s")

    @log_run_info
    def stop(self):
        """Request the polling loop to terminate."""
        self._stop_flag.set()

    @retrying(n=10)
    def _get_alerts(self, active=False):
        """Fetch alerts from Alert Manager.

        :param active: when True, request only currently-active alerts.
        :return: parsed JSON alert list, or None on a non-200 response.
        """
        if active:
            response = requests.get(
                f"{self._alert_manager_url}/alerts?active={int(active)}",
                timeout=3)
        else:
            response = requests.get(f"{self._alert_manager_url}/alerts",
                                    timeout=3)
        if response.status_code == 200:
            return response.json()
        return None

    def _publish_new_alerts(self, alerts: dict):  # pylint: disable=no-self-use
        """Publish a begin event for every newly-seen alert."""
        for alert in alerts.values():
            new_event = PrometheusAlertManagerEvent(raw_alert=alert)
            new_event.begin_event()

    def _publish_end_of_alerts(self, alerts: dict):
        """Publish end events for alerts that left the active set.

        Each finished alert is matched to its begun event in the registry by
        node / startsAt / alertname; when no begun counterpart exists an
        informational stand-alone end event is published instead.
        """
        all_alerts = self._get_alerts()
        updated_dict = {}
        if all_alerts:
            # Index the freshest alert copies by fingerprint for lookup below.
            for alert in all_alerts:
                fingerprint = alert.get('fingerprint', None)
                if not fingerprint:
                    continue
                updated_dict[fingerprint] = alert
        for alert in alerts.values():
            if not alert.get('endsAt', None):
                # Alert Manager gave no end time — stamp "now" (UTC).
                alert['endsAt'] = time.strftime("%Y-%m-%dT%H:%M:%S.0Z",
                                                time.gmtime())
            # Prefer the freshest copy of the alert when available.
            alert = updated_dict.get(alert['fingerprint'], alert)
            labels = alert.get("labels") or {}
            alert_name = labels.get("alertname", "")
            node = labels.get("instance", "N/A")

            # BUGFIX: build a fresh filter for every alert. The original
            # created one filter outside the loop and kept stacking
            # filter_by_attr constraints on it, so earlier alerts' attributes
            # leaked into the lookups for later alerts.
            event_filter = self.event_registry.get_registry_filter()
            event_filter.filter_by_attr(base="PrometheusAlertManagerEvent",
                                        node=node,
                                        starts_at=alert.get("startsAt"),
                                        alert_name=alert_name,
                                        period_type=EventPeriod.BEGIN.value)

            begun_events = event_filter.get_filtered()
            if not begun_events:
                new_event = PrometheusAlertManagerEvent(raw_alert=alert)
                new_event.period_type = EventPeriod.INFORMATIONAL.value
                new_event.end_event()
                # BUGFIX: keep processing the remaining alerts — the original
                # `return` here silently dropped every alert after the first
                # one without a begun counterpart.
                continue

            if len(begun_events) > 1:
                LOGGER.debug(
                    "Found %s events of type '%s' started at %s with period %s. "
                    "Will apply the function to most recent event by default.",
                    len(begun_events), alert_name, alert.get("startsAt"),
                    EventPeriod.BEGIN.value)

            # Most recent begun event wins.
            begun_events[-1].end_event()

    def run(self):
        """Main polling loop: diff the active alert set against the previous
        iteration; publish begin events for new alerts and end events for
        alerts that disappeared."""
        self.wait_till_alert_manager_up()
        existed = {}
        while not self._stop_flag.is_set():
            start_time = time.time()
            just_left = existed.copy()
            existing = {}
            new_ones = {}
            alerts = self._get_alerts(active=True)
            if alerts is not None:
                for alert in alerts:
                    fingerprint = alert.get('fingerprint', None)
                    if not fingerprint:
                        continue
                    state = alert.get('status', {}).get('state', '')
                    if state == 'suppressed':
                        # Silenced alerts are intentionally ignored.
                        continue
                    existing[fingerprint] = alert
                    if fingerprint in just_left:
                        # Still firing: neither new nor finished.
                        del just_left[fingerprint]
                        continue
                    new_ones[fingerprint] = alert
                existed = existing
            self._publish_new_alerts(new_ones)
            self._publish_end_of_alerts(just_left)
            # Sleep out the remainder of the polling interval, if any.
            delta = int((start_time + self._interval) - time.time())
            if delta > 0:
                time.sleep(delta)

    def silence(self,
                alert_name: str,
                duration: Optional[int] = None,
                start: Optional[datetime.datetime] = None,
                end: Optional[datetime.datetime] = None) -> str:
        """
        Silence an alert for a duration of time

        :param alert_name: name of the alert as it configured in prometheus
        :param duration: duration time in seconds, if None, start and end must be defined
        :param start: if None, would be default to current utc time
        :param end: if None, will be calculated by duration
        :return: silenceID
        """
        assert duration or (start and
                            end), "should define duration or (start and end)"
        if not start:
            start = datetime.datetime.utcnow()
        if not end:
            end = start + datetime.timedelta(seconds=duration)
        silence_data = {
            "matchers": [{
                "name": "alertname",
                "value": alert_name,
                "isRegex": True
            }],
            "startsAt": start.isoformat("T") + "Z",
            "endsAt": end.isoformat("T") + "Z",
            "createdBy": "SCT",
            "comment": "Silence by SCT code",
            "status": {
                "state": "active"
            }
        }
        res = requests.post(f"{self._alert_manager_url}/silences",
                            timeout=3,
                            json=silence_data)
        res.raise_for_status()
        return res.json()['silenceID']

    def delete_silence(self, silence_id: str) -> None:
        """
        delete a alert silence

        :param silence_id: silence id returned from `silence()` api call
        :return:
        """
        res = requests.delete(
            f"{self._alert_manager_url}/silence/{silence_id}", timeout=3)
        res.raise_for_status()
 def test_adding_a_non_continuous_event_raises_error(
         self, registry: ContinuousEventsRegistry,
         full_scan_event: FullScanEvent):
     """Registering a FullScanEvent (not a continuous event) must raise."""
     with pytest.raises(ContinuousEventRegistryException):
         registry.add_event(full_scan_event)
 def registry(self):
     """Yield a fresh, empty registry for each test.

     NOTE(review): presumably a pytest fixture — the decorator is not
     visible in this chunk; confirm against the full source.
     """
     yield ContinuousEventsRegistry()
 def test_adding_a_non_continuous_event_raises_error(
         self, registry: ContinuousEventsRegistry, info_event):
     """Registering a plain informational event must raise."""
     with pytest.raises(ContinuousEventRegistryException):
         registry.add_event(info_event)
def get_pattern_to_event_to_func_mapping(node: str) \
        -> List[ScyllaServerEventPatternFuncs]:
    """
    This function maps regex patterns, event classes and begin / end
    functions into ScyllaServerEventPatternFuncs object. Helper
    functions are delegated to find the event that should be the
    target of the start / stop action, or creating a new one.

    :param node: name of the node whose log lines the patterns match on.
    :return: one begin-pattern and one end-pattern entry per continuous
        event class.
    """
    mapping = []
    # One registry instance is shared (via closure) by both helpers below.
    event_registry = ContinuousEventsRegistry()

    def _shard_from(match: Match):
        # Regex groups are strings; the event/filter expects a numeric shard.
        # (Hoisted: the original called match.groupdict() twice per helper.)
        groups = match.groupdict()
        return int(groups["shard"]) if "shard" in groups else None

    def _add_event(event_type: Type[ScyllaDatabaseContinuousEvent],
                   match: Match):
        # Begin a new continuous event; this mapping variant forwards only
        # the optional "shard" group.
        new_event = event_type(node=node, shard=_shard_from(match))
        new_event.begin_event()

    def _end_event(event_type: Type[ScyllaDatabaseContinuousEvent],
                   match: Match):
        # Find the most recent begun event of this type on this node (and
        # shard, when captured) and end it.
        shard = _shard_from(match)
        event_filter = event_registry.get_registry_filter()
        event_filter \
            .filter_by_node(node=node) \
            .filter_by_type(event_type=event_type) \
            .filter_by_period(period_type=EventPeriod.BEGIN.value)

        if shard is not None:
            event_filter.filter_by_shard(shard)

        begun_events = event_filter.get_filtered()

        if not begun_events:
            # BUGFIX: the adjacent string literals were missing the space
            # between "{event_type}" and "with period type".
            raise ContinuousEventRegistryException(
                "Did not find any events of type {event_type} "
                "with period type {period_type}.".format(
                    event_type=event_type,
                    period_type=EventPeriod.BEGIN.value))
        if len(begun_events) > 1:
            # Lazy %-args: the message is only rendered when actually logged.
            LOGGER.warning(
                "Found %s events of type %s with period %s. "
                "Will apply the function to most recent event by default.",
                len(begun_events), event_type, EventPeriod.BEGIN.value)
        # Most recent begun event wins.
        begun_events[-1].end_event()

    for event in SCYLLA_DATABASE_CONTINUOUS_EVENTS:
        mapping.append(
            ScyllaServerEventPatternFuncs(
                pattern=re.compile(event.begin_pattern),
                event_class=event,
                period_func=partial(_add_event, event_type=event)))
        mapping.append(
            ScyllaServerEventPatternFuncs(
                pattern=re.compile(event.end_pattern),
                event_class=event,
                period_func=partial(_end_event, event_type=event)))

    return mapping