def __init__(self, ip, port=9093, interval=10, stop_flag: threading.Event = None):
    """Initialize the alert-manager listener thread.

    :param ip: Alert Manager host address.
    :param port: Alert Manager HTTP API port (default 9093).
    :param interval: polling interval in seconds.
    :param stop_flag: external event used to request shutdown; a private
        Event is created when none is supplied.
    """
    # Daemon thread named after the concrete class, so it never blocks exit.
    super().__init__(name=self.__class__.__name__, daemon=True)
    # Base URL of the Alert Manager v2 REST API.
    self._alert_manager_url = f"http://{ip}:{port}/api/v2"
    self._stop_flag = stop_flag if stop_flag else threading.Event()
    self._interval = interval
    self._timeout = 600  # seconds to wait for Alert Manager to become ready
    self.event_registry = ContinuousEventsRegistry()
def test_get_compact_events_by_continues_hash_from_log(
        self, populated_registry: ContinuousEventsRegistry):
    """Replaying begin/end compaction log lines adds, then removes, the event.

    The event is looked up by its continuous hash between the two replays,
    and its attributes are verified afterwards.
    """
    self._read_events_from_file("test_data/compaction_event_start.log")
    event_hash = CompactionEvent.get_continuous_hash_from_dict({
        'node': 'node1',
        'shard': '2',
        'table': 'system.local',
        'compaction_process_id': 'edc49670-2a65-11ec-a8b8-b62621e7624c',
    })
    matching = populated_registry.find_continuous_events_by_hash(event_hash)
    self._read_events_from_file("test_data/compaction_event_end.log")
    assert not populated_registry.find_continuous_events_by_hash(event_hash), \
        "Event was not removed from registry"
    assert matching
    latest = matching[-1]
    assert latest
    assert isinstance(latest, CompactionEvent)
    assert latest.node == 'node1'
    assert latest.shard == 2
    assert latest.table == 'system.local'
    assert latest.compaction_process_id == 'edc49670-2a65-11ec-a8b8-b62621e7624c'
def test_get_events_by_period_type(
        self, populated_registry: ContinuousEventsRegistry, nodetool_event):
    """Beginning one more event grows the BEGIN-period result set by one."""
    begun_before = len(
        populated_registry.get_events_by_period(period_type=EventPeriod.BEGIN))
    nodetool_event.begin_event()
    begun_after = populated_registry.get_events_by_period(
        period_type=EventPeriod.BEGIN)
    assert len(begun_after) == begun_before + 1
def get_pattern_to_event_to_func_mapping(node: str) \
        -> List[ScyllaServerEventPatternFuncs]:
    """
    This function maps regex patterns, event classes and begin / end functions
    into ScyllaServerEventPatternFuncs object. Helper functions are delegated
    to find the event that should be the target of the start / stop action,
    or creating a new one.
    """
    # NOTE(review): in this chunk `mapping` is never populated or returned —
    # the registration loop appears to lie outside the visible span; verify
    # against the full file.
    mapping = []
    event_registry = ContinuousEventsRegistry()

    def _add_event(event_type: Type[ScyllaDatabaseContinuousEvent],
                   match: Match):
        # All named regex groups become constructor kwargs; "shard" is
        # captured as text and must be coerced to int.
        kwargs = match.groupdict()
        if "shard" in kwargs:
            kwargs["shard"] = int(kwargs["shard"])
        event_type(node=node, **kwargs).begin_event()

    def _end_event(event_type: Type[ScyllaDatabaseContinuousEvent],
                   match: Match):
        # Locate the begun event by its continuous hash (node + captured
        # groups) and end the most recent match.
        kwargs = match.groupdict()
        continuous_hash = event_type.get_continuous_hash_from_dict({
            'node': node,
            **kwargs
        })
        if begin_event := event_registry.find_continuous_events_by_hash(
                continuous_hash):
            begin_event[-1].end_event()
            return
        # No matching begun event: report at DEBUG severity instead of failing.
        TestFrameworkEvent(
            source=event_type.__name__,
            message=
            f"Did not find events of type {event_type} with hash {continuous_hash} ({kwargs})"
            f" with period type {EventPeriod.BEGIN.value}",
            severity=Severity.DEBUG).publish_or_dump()
def test_get_event_by_id(self, populated_registry: ContinuousEventsRegistry):
    """Looking up a randomly chosen registered event by id returns it."""
    picked = random.choice(populated_registry.continuous_events)
    looked_up = populated_registry.get_event_by_id(picked.event_id)
    assert looked_up.event_id == picked.event_id
def test_get_compact_events_by_attr_from_log(
        self, populated_registry: ContinuousEventsRegistry):
    """Replay a compaction log through the pattern handlers, then filter the
    registry by attributes and verify both END events are found.

    Fix: the pattern->handler mapping was rebuilt for every log line although
    it only depends on the node name — build it once before the loop. The
    file is also iterated lazily instead of via readlines().
    """
    db_event_pattern_func_map = get_pattern_to_event_to_func_mapping(
        node='node1')
    with Path(__file__).parent.joinpath(
            "test_data/compaction_event.log").open(
                encoding="utf-8") as sct_log:
        for line in sct_log:
            for item in db_event_pattern_func_map:
                event_match = item.pattern.search(line)
                if event_match:
                    try:
                        item.period_func(match=event_match)
                    except RuntimeError as rex:
                        # Ignore the fact that the event is not published. It still will be created
                        if 'You should create default EventsProcessRegistry first' in str(
                                rex):
                            pass
    registry_filter = populated_registry.get_registry_filter()
    found_events = registry_filter.filter_by_attr(
        base="CompactionEvent",
        severity=Severity.NORMAL,
        period_type=EventPeriod.END.value,
        table='system.local').get_filtered()

    assert len(found_events) == 2, f"Found events: {found_events}"
    assert sorted([event.compaction_process_id for event in found_events]) == \
        ['7c58a350-2a65-11ec-b5b3-d14f790022cc', 'edc49670-2a65-11ec-a8b8-b62621e7624c']
def test_get_events_by_type(self, populated_registry: ContinuousEventsRegistry):
    """Type filtering returns exactly the GeminiStressEvent instances."""
    expected_count = sum(
        isinstance(event, GeminiStressEvent)
        for event in populated_registry.continuous_events)
    gemini_events = populated_registry.get_events_by_type(
        event_type=GeminiStressEvent)
    assert len(gemini_events) == expected_count
    for found in gemini_events:
        assert isinstance(found, GeminiStressEvent)
def test_get_events_by_attr(self, populated_registry: ContinuousEventsRegistry,
                            nodetool_event):
    """Attribute filtering finds the single event begun with a unique command."""
    nodetool_event.nodetool_command = 'test_get_events_by_attr'
    nodetool_event.event_id = 'dc4c854c-6bb5-4689-9af6-a9aae225611a'
    nodetool_event.begin_event()
    filtered = populated_registry.get_registry_filter().filter_by_attr(
        base="NodetoolEvent",
        severity=Severity.NORMAL,
        period_type=EventPeriod.BEGIN.value,
        nodetool_command='test_get_events_by_attr').get_filtered()
    assert len(filtered) == 1
    assert filtered[0] == nodetool_event
def get_pattern_to_event_to_func_mapping(node: str) \
        -> List[ScyllaServerEventPatternFuncs]:
    """
    This function maps regex patterns, event classes and begin / end functions
    into ScyllaServerEventPatternFuncs object. Helper functions are delegated
    to find the event that should be the target of the start / stop action,
    or creating a new one.
    """
    mapping = []
    event_registry = ContinuousEventsRegistry()

    def _add_event(event_type: Type[ScyllaDatabaseContinuousEvent],
                   match: Match):
        # All named regex groups become constructor kwargs; "shard" is
        # captured as text and must be coerced to int.
        kwargs = match.groupdict()
        if "shard" in kwargs:
            kwargs["shard"] = int(kwargs["shard"])
        new_event = event_type(node=node, **kwargs)
        new_event.begin_event()

    def _end_event(event_type: Type[ScyllaDatabaseContinuousEvent],
                   match: Match):
        # Narrow the registry down to begun events of this type on this node,
        # then refine by whichever optional groups the end-pattern captured.
        kwargs = match.groupdict()
        event_filter = event_registry.get_registry_filter()
        event_filter \
            .filter_by_node(node=node) \
            .filter_by_type(event_type=event_type) \
            .filter_by_period(period_type=EventPeriod.BEGIN.value)
        # NOTE(review): shard arrives as a string (e.g. "0"), which is truthy,
        # so shard 0 is still filtered here — confirm groups are never ints.
        if kwargs.get("shard"):
            event_filter.filter_by_shard(int(kwargs["shard"]))
        if kwargs.get("table"):
            event_filter.filter_by_attr(base="CompactionEvent",
                                        table=kwargs["table"])
        if kwargs.get("compaction_process_id"):
            event_filter.filter_by_attr(
                base="CompactionEvent",
                compaction_process_id=kwargs["compaction_process_id"])
        begun_events = event_filter.get_filtered()
        if not begun_events:
            # No matching begun event: publish a framework-level error event.
            TestFrameworkEvent(
                source=event_type.__name__,
                message="Did not find any events of type {event_type}"
                " with period type {period_type}, event data: {event_data}".
                format(
                    event_type=event_type,
                    period_type=EventPeriod.BEGIN.value,
                    event_data=kwargs,
                ),
                severity=Severity.ERROR).publish_or_dump()
            return
        if len(begun_events) > 1:
            LOGGER.debug(
                "Found %s events of type %s with period %s. "
                "Will apply the function to most recent event by default.",
                len(begun_events), event_type, EventPeriod.BEGIN.value)
        # Most recent begun event is the one to end.
        event = begun_events[-1]
        event.end_event()

    # Register a begin and an end handler for every continuous DB event type.
    for event in SCYLLA_DATABASE_CONTINUOUS_EVENTS:
        mapping.append(
            ScyllaServerEventPatternFuncs(
                pattern=re.compile(event.begin_pattern),
                event_class=event,
                period_func=partial(_add_event, event_type=event)))
        mapping.append(
            ScyllaServerEventPatternFuncs(
                pattern=re.compile(event.end_pattern),
                event_class=event,
                period_func=partial(_end_event, event_type=event)))
    return mapping
class PrometheusAlertManagerListener(threading.Thread):
    """Daemon thread that polls Prometheus Alert Manager and republishes
    alert transitions as continuous events."""

    def __init__(self, ip, port=9093, interval=10, stop_flag: threading.Event = None):
        """
        :param ip: Alert Manager host address.
        :param port: Alert Manager HTTP API port (default 9093).
        :param interval: polling interval in seconds.
        :param stop_flag: external shutdown event; a private one is created
            when not supplied.
        """
        super().__init__(name=self.__class__.__name__, daemon=True)
        self._alert_manager_url = f"http://{ip}:{port}/api/v2"
        self._stop_flag = stop_flag if stop_flag else threading.Event()
        self._interval = interval
        self._timeout = 600  # seconds to wait for Alert Manager to come up
        self.event_registry = ContinuousEventsRegistry()

    @property
    def is_alert_manager_up(self):
        # True only when the cluster status endpoint answers 'ready';
        # any error (connection, JSON, missing key) counts as "down".
        try:
            return requests.get(
                f"{self._alert_manager_url}/status",
                timeout=3).json()['cluster']['status'] == 'ready'
        except Exception:  # pylint: disable=broad-except
            return False

    @log_run_info
    def wait_till_alert_manager_up(self):
        """Block until Alert Manager is ready, stop is requested, or
        self._timeout seconds elapse (then raise TimeoutError)."""
        end_time = time.time() + self._timeout
        while time.time() < end_time and not self._stop_flag.is_set():
            if self.is_alert_manager_up:
                return
            time.sleep(30)
        if self._stop_flag.is_set():
            LOGGER.warning("Prometheus Alert Manager was asked to stop.")
        else:
            raise TimeoutError(
                f"Prometheus Alert Manager({self._alert_manager_url}) "
                f"did not get up for {self._timeout}s")

    @log_run_info
    def stop(self):
        # Signal the polling loop / waiters to exit.
        self._stop_flag.set()

    @retrying(n=10)
    def _get_alerts(self, active=False):
        """Fetch alerts (optionally only active ones); return parsed JSON
        on HTTP 200, else None. Retried up to 10 times."""
        if active:
            response = requests.get(
                f"{self._alert_manager_url}/alerts?active={int(active)}",
                timeout=3)
        else:
            response = requests.get(f"{self._alert_manager_url}/alerts",
                                    timeout=3)
        if response.status_code == 200:
            return response.json()
        return None

    def _publish_new_alerts(self, alerts: dict):  # pylint: disable=no-self-use
        """Publish a BEGIN event for every newly seen alert."""
        for alert in alerts.values():
            PrometheusAlertManagerEvent(raw_alert=alert).begin_event()

    def _publish_end_of_alerts(self, alerts: dict):
        """Publish END events for alerts that left the active set.

        Each departed alert is refreshed from the full alert list (keyed by
        fingerprint), then matched against begun events by continuous hash;
        unmatched alerts are published as one-shot INFORMATIONAL end events.
        """
        all_alerts = self._get_alerts()
        updated_dict = {}
        if all_alerts:
            for alert in all_alerts:
                fingerprint = alert.get('fingerprint', None)
                if not fingerprint:
                    continue
                updated_dict[fingerprint] = alert
        for alert in alerts.values():
            # Synthesize an end timestamp when Alert Manager supplied none.
            if not alert.get('endsAt', None):
                alert['endsAt'] = time.strftime("%Y-%m-%dT%H:%M:%S.0Z",
                                                time.gmtime())
            # Prefer the freshest copy of the alert, if we got one.
            alert = updated_dict.get(alert['fingerprint'], alert)
            labels = alert.get("labels") or {}
            alert_name = labels.get("alertname", "")
            node = labels.get("instance", "N/A")
            continuous_hash = PrometheusAlertManagerEvent.get_continuous_hash_from_dict(
                {
                    'node': node,
                    'starts_at': alert.get("startsAt"),
                    'alert_name': alert_name
                })
            if begin_event := self.event_registry.find_continuous_events_by_hash(
                    continuous_hash):
                begin_event[-1].end_event()
                continue
            # No matching BEGIN event was registered: emit an informational
            # end-only event instead.
            new_event = PrometheusAlertManagerEvent(raw_alert=alert)
            new_event.period_type = EventPeriod.INFORMATIONAL.value
            new_event.end_event()
class PrometheusAlertManagerListener(threading.Thread):
    """Daemon thread that polls Prometheus Alert Manager and republishes
    alert transitions as continuous events."""

    def __init__(self, ip, port=9093, interval=10, stop_flag: threading.Event = None):
        """
        :param ip: Alert Manager host address.
        :param port: Alert Manager HTTP API port (default 9093).
        :param interval: polling interval in seconds.
        :param stop_flag: external shutdown event; a private one is created
            when not supplied.
        """
        super().__init__(name=self.__class__.__name__, daemon=True)
        self._alert_manager_url = f"http://{ip}:{port}/api/v2"
        self._stop_flag = stop_flag if stop_flag else threading.Event()
        self._interval = interval
        self._timeout = 600  # seconds to wait for Alert Manager to come up
        self.event_registry = ContinuousEventsRegistry()

    @property
    def is_alert_manager_up(self):
        # True only when the cluster status endpoint answers 'ready';
        # any error (connection, JSON, missing key) counts as "down".
        try:
            return requests.get(
                f"{self._alert_manager_url}/status",
                timeout=3).json()['cluster']['status'] == 'ready'
        except Exception:  # pylint: disable=broad-except
            return False

    @log_run_info
    def wait_till_alert_manager_up(self):
        """Block until Alert Manager is ready, stop is requested, or
        self._timeout seconds elapse (then raise TimeoutError)."""
        end_time = time.time() + self._timeout
        while time.time() < end_time and not self._stop_flag.is_set():
            if self.is_alert_manager_up:
                return
            time.sleep(30)
        if self._stop_flag.is_set():
            LOGGER.warning("Prometheus Alert Manager was asked to stop.")
        else:
            raise TimeoutError(
                f"Prometheus Alert Manager({self._alert_manager_url}) "
                f"did not get up for {self._timeout}s")

    @log_run_info
    def stop(self):
        # Signal the polling loop / waiters to exit.
        self._stop_flag.set()

    @retrying(n=10)
    def _get_alerts(self, active=False):
        """Fetch alerts (optionally only active ones); return parsed JSON
        on HTTP 200, else None. Retried up to 10 times."""
        if active:
            response = requests.get(
                f"{self._alert_manager_url}/alerts?active={int(active)}",
                timeout=3)
        else:
            response = requests.get(f"{self._alert_manager_url}/alerts",
                                    timeout=3)
        if response.status_code == 200:
            return response.json()
        return None

    def _publish_new_alerts(self, alerts: dict):  # pylint: disable=no-self-use
        """Publish a BEGIN event for every newly seen alert."""
        for alert in alerts.values():
            new_event = PrometheusAlertManagerEvent(raw_alert=alert)
            new_event.begin_event()

    def _publish_end_of_alerts(self, alerts: dict):
        """Publish END events for alerts that left the active set.

        Each departed alert is refreshed from the full alert list (keyed by
        fingerprint) and matched against begun events in the registry;
        unmatched alerts are published as one-shot INFORMATIONAL end events.
        """
        all_alerts = self._get_alerts()
        updated_dict = {}
        if all_alerts:
            for alert in all_alerts:
                fingerprint = alert.get('fingerprint', None)
                if not fingerprint:
                    continue
                updated_dict[fingerprint] = alert
        for alert in alerts.values():
            # Synthesize an end timestamp when Alert Manager supplied none.
            if not alert.get('endsAt', None):
                alert['endsAt'] = time.strftime("%Y-%m-%dT%H:%M:%S.0Z",
                                                time.gmtime())
            # Prefer the freshest copy of the alert, if we got one.
            alert = updated_dict.get(alert['fingerprint'], alert)
            labels = alert.get("labels") or {}
            alert_name = labels.get("alertname", "")
            node = labels.get("instance", "N/A")
            # BUGFIX: build a fresh filter for every alert. The filter object
            # accumulates criteria, so reusing a single instance across loop
            # iterations narrowed the result with every processed alert.
            event_filter = self.event_registry.get_registry_filter()
            event_filter.filter_by_attr(base="PrometheusAlertManagerEvent",
                                        node=node,
                                        starts_at=alert.get("startsAt"),
                                        alert_name=alert_name,
                                        period_type=EventPeriod.BEGIN.value)
            begun_events = event_filter.get_filtered()
            if not begun_events:
                # No matching BEGIN event: emit an informational end-only
                # event and keep going (BUGFIX: was `return`, which silently
                # dropped all remaining departed alerts).
                new_event = PrometheusAlertManagerEvent(raw_alert=alert)
                new_event.period_type = EventPeriod.INFORMATIONAL.value
                new_event.end_event()
                continue
            if len(begun_events) > 1:
                LOGGER.debug(
                    "Found %s events of type '%s' started at %s with period %s. "
                    "Will apply the function to most recent event by default.",
                    len(begun_events), alert_name, alert.get("startsAt"),
                    EventPeriod.BEGIN.value)
            event = begun_events[-1]
            event.end_event()

    def run(self):
        """Poll active alerts every self._interval seconds, publishing BEGIN
        events for new alerts and END events for ones that disappeared."""
        self.wait_till_alert_manager_up()
        existed = {}
        while not self._stop_flag.is_set():
            start_time = time.time()
            just_left = existed.copy()
            existing = {}
            new_ones = {}
            alerts = self._get_alerts(active=True)
            if alerts is not None:
                for alert in alerts:
                    fingerprint = alert.get('fingerprint', None)
                    if not fingerprint:
                        continue
                    state = alert.get('status', {}).get('state', '')
                    if state == 'suppressed':
                        # Silenced alerts are neither begun nor ended.
                        continue
                    existing[fingerprint] = alert
                    if fingerprint in just_left:
                        del just_left[fingerprint]
                        continue
                    new_ones[fingerprint] = alert
                existed = existing
            self._publish_new_alerts(new_ones)
            self._publish_end_of_alerts(just_left)
            # Sleep only for whatever remains of the polling interval.
            delta = int((start_time + self._interval) - time.time())
            if delta > 0:
                time.sleep(int(delta))

    def silence(self,
                alert_name: str,
                duration: Optional[int] = None,
                start: Optional[datetime.datetime] = None,
                end: Optional[datetime.datetime] = None) -> str:
        """
        Silence an alert for a duration of time

        :param alert_name: name of the alert as it configured in prometheus
        :param duration: duration time in seconds, if None, start and end must be defined
        :param start: if None, would be default to current utc time
        :param end: if None, will be calculated by duration
        :return: silenceID
        """
        assert duration or (start and end), \
            "should define duration or (start and end)"
        if not start:
            # NOTE(review): naive UTC timestamps; Alert Manager expects the
            # trailing "Z" added below — confirm before switching to aware.
            start = datetime.datetime.utcnow()
        if not end:
            end = start + datetime.timedelta(seconds=duration)
        silence_data = {
            "matchers": [{
                "name": "alertname",
                "value": alert_name,
                "isRegex": True
            }],
            "startsAt": start.isoformat("T") + "Z",
            "endsAt": end.isoformat("T") + "Z",
            "createdBy": "SCT",
            "comment": "Silence by SCT code",
            "status": {
                "state": "active"
            }
        }
        res = requests.post(f"{self._alert_manager_url}/silences",
                            timeout=3,
                            json=silence_data)
        res.raise_for_status()
        return res.json()['silenceID']

    def delete_silence(self, silence_id: str) -> None:
        """
        delete a alert silence

        :param silence_id: silence id returned from `silence()` api call
        :return:
        """
        res = requests.delete(
            f"{self._alert_manager_url}/silence/{silence_id}", timeout=3)
        res.raise_for_status()
def test_adding_a_non_continuous_event_raises_error(
        self, registry: ContinuousEventsRegistry,
        full_scan_event: FullScanEvent):
    """Registering a non-continuous event (FullScanEvent) must be rejected."""
    with pytest.raises(ContinuousEventRegistryException):
        registry.add_event(full_scan_event)
def registry(self):
    """Fixture: yield a ContinuousEventsRegistry instance for the test."""
    fresh_registry = ContinuousEventsRegistry()
    yield fresh_registry
def test_adding_a_non_continuous_event_raises_error(
        self, registry: ContinuousEventsRegistry, info_event):
    """Registering a plain informational event must be rejected."""
    with pytest.raises(ContinuousEventRegistryException):
        registry.add_event(info_event)
def get_pattern_to_event_to_func_mapping(node: str) \
        -> List[ScyllaServerEventPatternFuncs]:
    """
    This function maps regex patterns, event classes and begin / end functions
    into ScyllaServerEventPatternFuncs object. Helper functions are delegated
    to find the event that should be the target of the start / stop action,
    or creating a new one.

    :param node: name of the node the matched log lines originate from.
    :return: one ScyllaServerEventPatternFuncs per begin/end pattern of every
        ScyllaDatabaseContinuousEvent subclass.
    :raises ContinuousEventRegistryException: from _end_event when no begun
        event matches the end pattern.
    """
    mapping = []
    event_registry = ContinuousEventsRegistry()

    def _add_event(event_type: Type[ScyllaDatabaseContinuousEvent],
                   match: Match):
        # Begin a new continuous event; only the optional "shard" group is
        # forwarded to the constructor, coerced from text to int.
        groups = match.groupdict()
        shard = int(groups["shard"]) if "shard" in groups else None
        new_event = event_type(node=node, shard=shard)
        new_event.begin_event()

    def _end_event(event_type: Type[ScyllaDatabaseContinuousEvent],
                   match: Match):
        # Narrow the registry to begun events of this type on this node
        # (and shard, when captured), then end the most recent one.
        groups = match.groupdict()
        shard = int(groups["shard"]) if "shard" in groups else None
        event_filter = event_registry.get_registry_filter()
        event_filter \
            .filter_by_node(node=node) \
            .filter_by_type(event_type=event_type) \
            .filter_by_period(period_type=EventPeriod.BEGIN.value)
        if shard is not None:
            event_filter.filter_by_shard(shard)
        begun_events = event_filter.get_filtered()
        if not begun_events:
            # BUGFIX: the adjacent string literals previously concatenated
            # without a separating space ("...{event_type}with period...").
            raise ContinuousEventRegistryException(
                "Did not find any events of type {event_type} "
                "with period type {period_type}.".format(
                    event_type=event_type,
                    period_type=EventPeriod.BEGIN.value))
        if len(begun_events) > 1:
            # Lazy %-style args: the message is only formatted when WARNING
            # is actually emitted.
            LOGGER.warning(
                "Found %s events of type %s with period %s. "
                "Will apply the function to most recent event by default.",
                len(begun_events), event_type, EventPeriod.BEGIN.value)
        event = begun_events[-1]
        event.end_event()

    # Register a begin and an end handler for every continuous DB event type.
    for event in SCYLLA_DATABASE_CONTINUOUS_EVENTS:
        mapping.append(
            ScyllaServerEventPatternFuncs(
                pattern=re.compile(event.begin_pattern),
                event_class=event,
                period_func=partial(_add_event, event_type=event)))
        mapping.append(
            ScyllaServerEventPatternFuncs(
                pattern=re.compile(event.end_pattern),
                event_class=event,
                period_func=partial(_end_event, event_type=event)))
    return mapping