def test_severity_changer_db_log(self):
    """ See https://github.com/scylladb/scylla-cluster-tests/issues/2115 """
    # Phase 1: demote DatabaseLogEvent severity to WARNING; the filter keeps
    # working for 1 extra second after its context exits.
    with self.wait_for_n_events(self.get_events_logger(), count=4, timeout=3):
        severity_changer = EventsSeverityChangerFilter(new_severity=Severity.WARNING,
                                                       event_class=DatabaseLogEvent,
                                                       extra_time_to_expiration=1)
        with severity_changer:
            for suffix in ("#1", "#2"):
                DatabaseLogEvent.NO_SPACE_ERROR() \
                    .add_info(node="A", line_number=22,
                              line="critical that should be lowered " + suffix) \
                    .publish()
    warning_log = self.get_event_log_file("warning.log")
    self.assertIn("DatabaseLogEvent", warning_log)
    self.assertIn("critical that should be lowered #1", warning_log)
    self.assertIn("critical that should be lowered #2", warning_log)

    # Phase 2: one of the next DatabaseLogEvent events should expire the
    # EventsSeverityChangerFilter (and not crash all subscribers).
    with self.wait_for_n_events(self.get_events_logger(), count=2, timeout=3):
        for _attempt in (1, 2):
            time.sleep(1)
            DatabaseLogEvent.NO_SPACE_ERROR() \
                .add_info(node="A", line_number=22,
                          line="critical that shouldn't be lowered") \
                .publish()
    error_log = self.get_event_log_file("error.log")
    self.assertIn("critical that shouldn't be lowered", error_log)
def test_filter(self):
    """Events matched by any active DbEventsFilter must not reach events.log."""
    commitlog_enospc_line = \
        "[99.80.124.204] [stdout] Mar 31 09:08:10 warning| [shard 8] commitlog - Exception in segment " \
        "reservation: storage_io_error (Storage I/O error: 28: No space left on device)"
    hints_enospc_line = \
        "2019-10-29T12:19:49+00:00 ip-172-30-0-184 !WARNING | scylla: [shard 2] storage_service - Commitlog " \
        "error: std::filesystem::__cxx11::filesystem_error (error system:28, filesystem error: open failed: No " \
        "space left on device [/var/lib/scylla/hints/2/172.30.0.116/HintsLog-1-36028797019122576.log])"

    log_content_before = self.get_event_log_file("events.log")

    # 13 events in total: 2 events per filter x 4 filters + 5 events.
    with self.wait_for_n_events(self.get_events_logger(), count=13, timeout=3):
        with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR):
            with DbEventsFilter(db_event=DatabaseLogEvent.BACKTRACE, line="No space left on device"):
                with DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, line="No space left on device"):
                    with DbEventsFilter(db_event=DatabaseLogEvent.FILESYSTEM_ERROR, line="No space left on device"):
                        publish_plan = (
                            (DatabaseLogEvent.NO_SPACE_ERROR, commitlog_enospc_line),
                            (DatabaseLogEvent.BACKTRACE, commitlog_enospc_line),
                            (DatabaseLogEvent.FILESYSTEM_ERROR, hints_enospc_line),
                            (DatabaseLogEvent.DATABASE_ERROR, commitlog_enospc_line),
                            (DatabaseLogEvent.NO_SPACE_ERROR, commitlog_enospc_line),
                        )
                        for event_type, log_line in publish_plan:
                            event_type().add_info(node="A", line_number=22, line=log_line).publish()

    # All published events were filtered out, so events.log must be untouched.
    self.assertEqual(log_content_before, self.get_event_log_file("events.log"))
def test_filter_repair(self):
    """Aborted-repair RUNTIME_ERRORs are filtered; unrelated events still pass through."""
    aborted_repair_line = "2019-07-28T10:53:29+00:00 ip-10-0-167-91 !INFO | scylla.bin: [shard 0] repair - " \
                          "Got error in row level repair: std::runtime_error (repair id 1 is aborted on shard 0)"
    # 9 events in total: 2 events per filter x 3 filters + 3 events.
    with self.wait_for_n_events(self.get_events_logger(), count=9, timeout=3):
        with DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR,
                            line="repair's stream failed: streaming::stream_exception"):
            with DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="Can not find stream_manager"):
                with DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="is aborted"):
                    for _ in range(2):
                        DatabaseLogEvent.RUNTIME_ERROR().add_info(
                            node="A", line_number=22, line=aborted_repair_line).publish()
                    DatabaseLogEvent.NO_SPACE_ERROR().add_info(
                        node="B", line_number=22, line="not filtered").publish()
    events_log = self.get_event_log_file("events.log")
    self.assertIn("not filtered", events_log)
    self.assertNotIn("repair id 1", events_log)
def test_filter_expiration(self):
    """Events published shortly after a DbEventsFilter closes are still filtered."""
    with self.wait_for_n_events(self.get_events_logger(), count=5, timeout=10):
        stamp = f"{datetime.utcnow():%Y-%m-%dT%H:%M:%S+00:00}"
        with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR, node="A"):
            DatabaseLogEvent.NO_SPACE_ERROR() \
                .add_info(node="A", line_number=22, line=stamp + " this is filtered") \
                .publish()
            time.sleep(2)
            DatabaseLogEvent.NO_SPACE_ERROR() \
                .add_info(node="A", line_number=22, line=stamp + " this is filtered") \
                .publish()
        # The filter context has exited; wait past its grace period, then
        # publish with a fresh timestamp — this one must NOT be filtered.
        time.sleep(2)
        stamp = f"{datetime.utcnow():%Y-%m-%dT%H:%M:%S+00:00}"
        DatabaseLogEvent.NO_SPACE_ERROR() \
            .add_info(node="A", line_number=22, line=stamp + " this is not filtered") \
            .publish()
    events_log = self.get_event_log_file("events.log")
    self.assertIn("this is not filtered", events_log)
    self.assertNotIn("this is filtered", events_log)
def test_reactor_stalled_severity(self):
    """add_info() demotes REACTOR_STALLED to NORMAL only for tolerable stall times."""
    cases = (
        # (node, line_number, stall duration in ms, severity expected after add_info)
        ("n1", 1, TOLERABLE_REACTOR_STALL, Severity.NORMAL),
        ("n2", 2, TOLERABLE_REACTOR_STALL + 1, Severity.WARNING),
    )
    for node, line_number, stall_ms, expected_severity in cases:
        event = DatabaseLogEvent.REACTOR_STALLED()
        # A fresh REACTOR_STALLED always starts at WARNING.
        self.assertEqual(event.severity, Severity.WARNING)
        # add_info() is fluent: it returns the same instance it was called on.
        self.assertIs(event, event.add_info(node=node,
                                            line=f"{stall_ms} ms",
                                            line_number=line_number))
        self.assertEqual(event.severity, expected_severity)
        self.assertEqual(event.node, node)
        self.assertEqual(event.line, f"{stall_ms} ms")
        self.assertEqual(event.line_number, line_number)
def test_eval_filter_event_class_and_regex(self):
    """eval_filter() matches only when BOTH the event class and the regex match.

    Fix: the local was named ``filter``, shadowing the builtin; renamed to
    ``events_filter``.
    """
    events_filter = EventsFilter(event_class=DatabaseLogEvent.BAD_ALLOC, regex=".*xyz.*")
    event1 = DatabaseLogEvent.BAD_ALLOC().add_info(node="node1", line="xyz", line_number=1)
    event2 = DatabaseLogEvent.NO_SPACE_ERROR().add_info(node="node1", line="xyz", line_number=1)
    event3 = DatabaseLogEvent.NO_SPACE_ERROR().add_info(node="node1", line="abc", line_number=1)

    self.assertTrue(events_filter.eval_filter(event1))   # class and regex match
    self.assertFalse(events_filter.eval_filter(event2))  # regex matches, class does not
    self.assertFalse(events_filter.eval_filter(event3))  # neither matches
def test_eval_filter_just_type(self):
    """A DbEventsFilter with only a db_event matches purely on event type.

    Fix: the local was named ``filter``, shadowing the builtin; renamed to
    ``events_filter``.
    """
    events_filter = DbEventsFilter(db_event=DatabaseLogEvent.REACTOR_STALLED)

    # The filter must survive pickling and be JSON-serializable.
    self.assertEqual(events_filter, pickle.loads(pickle.dumps(events_filter)))
    events_filter.to_json()

    event1 = DatabaseLogEvent.REACTOR_STALLED()
    event2 = DatabaseLogEvent.NO_SPACE_ERROR()

    self.assertTrue(events_filter.eval_filter(event1))
    self.assertFalse(events_filter.eval_filter(event2))
def test_eval_filter_type_with_line_and_node(self):
    """Type + node + line-substring must all match for the filter to fire.

    Fix: the local was named ``filter``, shadowing the builtin; renamed to
    ``db_events_filter`` (consistent with the sibling tests).
    """
    db_events_filter = DbEventsFilter(db_event=DatabaseLogEvent.BAD_ALLOC, node="node1", line="y")
    event1 = DatabaseLogEvent.BAD_ALLOC().add_info(node="node1", line="xyz", line_number=1)
    event2 = event1.clone().add_info(node="node1", line="abc", line_number=1)
    event3 = DatabaseLogEvent.NO_SPACE_ERROR().add_info(node="node1", line="xyz", line_number=1)

    self.assertTrue(db_events_filter.eval_filter(event1))   # type, node and line all match
    self.assertFalse(db_events_filter.eval_filter(event2))  # line substring "y" absent
    self.assertFalse(db_events_filter.eval_filter(event3))  # wrong event type
def test_eval_filter_event_class_common_parent(self):
    """Filtering on a parent event class matches every subtype.

    Fix: the local was named ``filter``, shadowing the builtin; renamed to
    ``events_filter``.
    """
    events_filter = EventsFilter(event_class=DatabaseLogEvent)

    # The filter must survive pickling and be JSON-serializable.
    self.assertEqual(events_filter, pickle.loads(pickle.dumps(events_filter)))
    events_filter.to_json()

    event1 = DatabaseLogEvent.BAD_ALLOC()
    event2 = DatabaseLogEvent.NO_SPACE_ERROR()

    self.assertTrue(events_filter.eval_filter(event1))
    self.assertTrue(events_filter.eval_filter(event2))
def test_eval_filter_event_class(self):
    """Filtering on a concrete event subtype matches only that subtype."""
    db_events_filter = EventsFilter(event_class=DatabaseLogEvent.BAD_ALLOC)

    # The filter must survive pickling and be JSON-serializable.
    self.assertEqual(db_events_filter, pickle.loads(pickle.dumps(db_events_filter)))
    db_events_filter.to_json()

    matching_event = DatabaseLogEvent.BAD_ALLOC()
    other_event = DatabaseLogEvent.NO_SPACE_ERROR()

    self.assertTrue(db_events_filter.eval_filter(matching_event))
    self.assertFalse(db_events_filter.eval_filter(other_event))
def test_filter_by_node(self):
    """A node-scoped DbEventsFilter drops events from that node only."""
    with self.wait_for_n_events(self.get_events_logger(), count=4, timeout=3):
        with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR, node="A"):
            for node, message in (("A", "this is filtered"), ("B", "not filtered")):
                DatabaseLogEvent.NO_SPACE_ERROR().add_info(node=node,
                                                           line_number=22,
                                                           line=message).publish()
    events_log = self.get_event_log_file("events.log")
    self.assertIn("not filtered", events_log)
    self.assertNotIn("this is filtered", events_log)
def test_kernel_callstack_severity(self):
    # A fresh KERNEL_CALLSTACK event starts at DEBUG severity.
    event1 = DatabaseLogEvent.KERNEL_CALLSTACK()
    self.assertEqual(event1.severity, Severity.DEBUG)
    # add_info() is fluent: it returns the same event instance.
    self.assertIs(event1, event1.add_info(node="n1",
                                          line="kernel callstack 0xffffffffffffff80",
                                          line_number=1))
    self.assertEqual(event1.node, "n1")
    self.assertEqual(event1.line_number, 1)

    event2 = DatabaseLogEvent.REACTOR_STALLED()
    # NOTE(review): this asserts DEBUG on a REACTOR_STALLED *before* add_info(),
    # while test_reactor_stalled_severity expects a fresh REACTOR_STALLED to be
    # WARNING. Presumably this assertion was meant to run after add_info() (the
    # "kernel callstack" line demoting the severity) — confirm the intended order.
    self.assertEqual(event2.severity, Severity.DEBUG)
    self.assertIs(event2, event2.add_info(node="n2",
                                          line="kernel callstack 0xffffffffffffff80",
                                          line_number=2))
    self.assertEqual(event2.node, "n2")
    self.assertEqual(event2.line_number, 2)
def test_filter_upgrade(self):
    """Known schema-load failures are filtered; unrelated events still pass through."""
    schema_load_failure_line = "!ERR | scylla: [shard 3] storage_proxy - Exception when communicating with " \
                               "10.142.0.56: std::runtime_error (Failed to load schema version " \
                               "b40e405f-462c-38f2-a90c-6f130ddbf6f3) "
    with self.wait_for_n_events(self.get_events_logger(), count=5, timeout=3):
        with DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="Failed to load schema"):
            for _ in range(2):
                DatabaseLogEvent.RUNTIME_ERROR().add_info(node="A",
                                                          line_number=22,
                                                          line=schema_load_failure_line).publish()
            DatabaseLogEvent.NO_SPACE_ERROR().add_info(node="B",
                                                       line_number=22,
                                                       line="not filtered").publish()
    events_log = self.get_event_log_file("events.log")
    self.assertIn("not filtered", events_log)
    self.assertNotIn("Exception when communicating", events_log)
def test_default_filters(self):
    """The built-in default filters drop rate-limit backtrace spam but keep real backtraces."""
    with self.wait_for_n_events(self.get_events_logger(), count=2):
        rate_limit_line = ("Jul 01 03:37:31 ip-10-0-127-151.eu-west-1.compute.internal"
                           " scylla[6026]:Rate-limit: supressed 4294967292 backtraces on shard 5")
        DatabaseLogEvent.BACKTRACE() \
            .add_info(node="A", line_number=22, line=rate_limit_line) \
            .publish()
        DatabaseLogEvent.BACKTRACE() \
            .add_info(node="A", line_number=22,
                      line="other back trace that shouldn't be filtered") \
            .publish()
    events_log = self.get_event_log_file("events.log")
    self.assertIn("other back trace", events_log)
    # "supressed" [sic] matches the spelling in the published log line.
    self.assertNotIn("supressed", events_log)
def test_eval_filter(self):
    """EventsSeverityChangerFilter.eval_filter() rewrites a matched event's severity in place."""
    severity_changer = EventsSeverityChangerFilter(new_severity=Severity.NORMAL,
                                                   event_class=DatabaseLogEvent)
    event = DatabaseLogEvent.BAD_ALLOC()
    self.assertEqual(event.severity, Severity.ERROR)

    severity_changer.eval_filter(event)

    self.assertEqual(event.severity, Severity.NORMAL)
def test_failed_stall_during_filter(self):
    """A REACTOR_STALLED published while unrelated filters are active keeps its own severity."""
    with self.wait_for_n_events(self.get_events_logger(), count=5, timeout=3):
        with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR):
            with DbEventsFilter(db_event=DatabaseLogEvent.BACKTRACE, line="No space left on device"):
                stall_event = DatabaseLogEvent.REACTOR_STALLED()
                stall_event.add_info(
                    node="A", line_number=22,
                    line="[99.80.124.204] [stdout] Mar 31 09:08:10 warning| reactor stall 20").publish()
                self.assertEqual(stall_event.severity, Severity.DEBUG)
def test_search_power_off(self):
    """POWER_OFF events from the filtered node are dropped; a fresh one from another node is kept."""
    self.node.system_log = os.path.join(os.path.dirname(__file__), 'test_data', 'power_off.log')
    with DbEventsFilter(db_event=DatabaseLogEvent.POWER_OFF, node=self.node):
        self.node._read_system_log_and_publish_events(
            start_from_beginning=True)

    # Publish a POWER_OFF from a different node with a timestamp 1s in the
    # future — it must not be caught by the node-scoped filter above.
    future_stamp = f"{datetime.utcfromtimestamp(time.time() + 1):%Y-%m-%dT%H:%M:%S+00:00} "
    DatabaseLogEvent.POWER_OFF().add_info(
        node="A",
        line_number=22,
        line=future_stamp +
        "longevity-large-collections-12h-mas-db-node-c6a4e04e-1 !INFO | systemd-logind: Powering Off..."
    ).publish()
    time.sleep(0.1)
    with self.get_events_logger().events_logs_by_severity[
            Severity.CRITICAL].open() as events_file:
        events = [line for line in events_file if 'Powering Off' in line]
    assert events
def test_disk_error_event(self):
    """DISK_ERROR's regex should single out the storage_service I/O-error line of a log excerpt."""
    disk_error_event = DatabaseLogEvent.DISK_ERROR()
    node_name = "longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4"
    # pylint: disable=line-too-long
    log_lines = """2022-02-07T06:13:14+00:00 longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4 ! INFO | [shard 8] compaction - [Compact keyspace1.standard1 089530c0-87dd-11ec-8382-519d84e34cb0] Compacting [/var/lib/scylla/data/keyspace1/standard1-b8e41570875811ec8382519d84e34cb0/md-287128-big-Data.db:level=2:origin=compaction,/var/lib/scylla/data/keyspace1/standard1-b8e41570875811ec8382519d84e34cb0/md-287112-big-Data.db:level=2:origin=compaction,/var/lib/scylla/data/keyspace1/standard1-b8e41570875811ec8382519d84e34cb0/md-290136-big-Data.db:level=2:origin=compaction,/var/lib/scylla/data/keyspace1/standard1-b8e41570875811ec8382519d84e34cb0/md-291192-big-Data.db:level=1:origin=compaction]
2022-02-07T06:13:14+00:00 longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4 ! ERR | blk_update_request: critical medium error, dev nvme0n11, sector 4141328 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0
2022-02-07T06:13:14+00:00 longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4 ! ERR | [shard 6] storage_service - Shutting down communications due to I/O errors until operator intervention: Disk error: std::system_error (error system:61, No data available)
2022-02-07T06:13:14+00:00 longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4 ! INFO | [shard 0] storage_service - Stop transport: starts
2022-02-07T06:13:14+00:00 longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4 ! INFO | [shard 0] storage_service - Shutting down native transport
"""
    # The storage_service "Disk error" line is index 2 of the excerpt above.
    expected_error_data = {
        "line_number": 2,
        "line": "2022-02-07T06:13:14+00:00 longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4 ! ERR | [shard 6] storage_service - Shutting down communications due to I/O errors until operator intervention: Disk error: std::system_error (error system:61, No data available)",
        "node": node_name,
    }

    # Run every line through the event's regex; when several lines match,
    # the last matching one wins (add_info overwrites the previous values).
    for line_number, line in enumerate(log_lines.splitlines()):
        if re.search(disk_error_event.regex, line):
            disk_error_event.add_info(node_name, line, line_number)

    self.assertEqual(expected_error_data["node"], disk_error_event.node)
    self.assertEqual(expected_error_data["line_number"], disk_error_event.line_number)
    self.assertEqual(expected_error_data["line"], disk_error_event.line)