Ejemplo n.º 1
0
    def test_severity_changer_db_log(self):
        """
            See https://github.com/scylladb/scylla-cluster-tests/issues/2115
        """
        # 1) Lower DatabaseLogEvent to WARNING for 1 sec.
        with self.wait_for_n_events(self.get_events_logger(), count=4, timeout=3):
            with EventsSeverityChangerFilter(new_severity=Severity.WARNING,
                                             event_class=DatabaseLogEvent,
                                             extra_time_to_expiration=1):
                DatabaseLogEvent.NO_SPACE_ERROR() \
                    .add_info(node="A", line_number=22, line="critical that should be lowered #1") \
                    .publish()
            DatabaseLogEvent.NO_SPACE_ERROR() \
                .add_info(node="A", line_number=22, line="critical that should be lowered #2") \
                .publish()

        log_content = self.get_event_log_file("warning.log")

        self.assertIn("DatabaseLogEvent", log_content)
        self.assertIn("critical that should be lowered #1", log_content)
        self.assertIn("critical that should be lowered #2", log_content)

        # 2) One of the next DatabaseLogEvent event should expire the EventsSeverityChangerFilter
        #    (and not crash all subscribers)
        with self.wait_for_n_events(self.get_events_logger(), count=2, timeout=3):
            for _ in range(2):
                time.sleep(1)
                DatabaseLogEvent.NO_SPACE_ERROR() \
                    .add_info(node="A", line_number=22, line="critical that shouldn't be lowered") \
                    .publish()

        log_content = self.get_event_log_file("error.log")

        self.assertIn("critical that shouldn't be lowered", log_content)
Ejemplo n.º 2
0
    def test_filter(self):
        enospc_line_1 = \
            "[99.80.124.204] [stdout] Mar 31 09:08:10 warning|  [shard 8] commitlog - Exception in segment " \
            "reservation: storage_io_error (Storage I/O error: 28: No space left on device)"
        enospc_line_2 = \
            "2019-10-29T12:19:49+00:00  ip-172-30-0-184 !WARNING | scylla: [shard 2] storage_service - Commitlog " \
            "error: std::filesystem::__cxx11::filesystem_error (error system:28, filesystem error: open failed: No " \
            "space left on device [/var/lib/scylla/hints/2/172.30.0.116/HintsLog-1-36028797019122576.log])"

        log_content_before = self.get_event_log_file("events.log")

        # 13 events in total: 2 events per filter x 4 filters + 5 events.
        with self.wait_for_n_events(self.get_events_logger(),
                                    count=13,
                                    timeout=3):
            with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR), \
                    DbEventsFilter(db_event=DatabaseLogEvent.BACKTRACE, line="No space left on device"), \
                    DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, line="No space left on device"), \
                    DbEventsFilter(db_event=DatabaseLogEvent.FILESYSTEM_ERROR, line="No space left on device"):
                DatabaseLogEvent.NO_SPACE_ERROR().add_info(
                    node="A", line_number=22, line=enospc_line_1).publish()
                DatabaseLogEvent.BACKTRACE().add_info(
                    node="A", line_number=22, line=enospc_line_1).publish()
                DatabaseLogEvent.FILESYSTEM_ERROR().add_info(
                    node="A", line_number=22, line=enospc_line_2).publish()
                DatabaseLogEvent.DATABASE_ERROR().add_info(
                    node="A", line_number=22, line=enospc_line_1).publish()
                DatabaseLogEvent.NO_SPACE_ERROR().add_info(
                    node="A", line_number=22, line=enospc_line_1).publish()

        self.assertEqual(log_content_before,
                         self.get_event_log_file("events.log"))
Ejemplo n.º 3
0
    def test_filter_repair(self):
        failed_repaired_line = "2019-07-28T10:53:29+00:00  ip-10-0-167-91 !INFO    | scylla.bin: [shard 0] repair - " \
                               "Got error in row level repair: std::runtime_error (repair id 1 is aborted on shard 0)"

        # 9 events in total: 2 events per filter x 3 filters + 3 events.
        with self.wait_for_n_events(self.get_events_logger(),
                                    count=9,
                                    timeout=3):
            with DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR,
                                line="repair's stream failed: streaming::stream_exception"), \
                    DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR,
                                   line="Can not find stream_manager"), \
                    DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="is aborted"):

                DatabaseLogEvent.RUNTIME_ERROR().add_info(
                    node="A", line_number=22,
                    line=failed_repaired_line).publish()
                DatabaseLogEvent.RUNTIME_ERROR().add_info(
                    node="A", line_number=22,
                    line=failed_repaired_line).publish()
                DatabaseLogEvent.NO_SPACE_ERROR().add_info(
                    node="B", line_number=22, line="not filtered").publish()

        log_content = self.get_event_log_file("events.log")

        self.assertIn("not filtered", log_content)
        self.assertNotIn("repair id 1", log_content)
Ejemplo n.º 4
0
    def test_filter_expiration(self):
        with self.wait_for_n_events(self.get_events_logger(), count=5, timeout=10):
            line_prefix = f"{datetime.utcnow():%Y-%m-%dT%H:%M:%S+00:00}"

            with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR, node="A"):
                DatabaseLogEvent.NO_SPACE_ERROR() \
                    .add_info(node="A", line_number=22, line=line_prefix + " this is filtered") \
                    .publish()

            time.sleep(2)

            DatabaseLogEvent.NO_SPACE_ERROR() \
                .add_info(node="A", line_number=22, line=line_prefix + " this is filtered") \
                .publish()

            time.sleep(2)

            line_prefix = f"{datetime.utcnow():%Y-%m-%dT%H:%M:%S+00:00}"

            DatabaseLogEvent.NO_SPACE_ERROR() \
                .add_info(node="A", line_number=22, line=line_prefix + " this is not filtered") \
                .publish()

        log_content = self.get_event_log_file("events.log")

        self.assertIn("this is not filtered", log_content)
        self.assertNotIn("this is filtered", log_content)
Ejemplo n.º 5
0
    def test_reactor_stalled_severity(self):
        event1 = DatabaseLogEvent.REACTOR_STALLED()
        self.assertEqual(event1.severity, Severity.WARNING)

        self.assertIs(
            event1,
            event1.add_info(node="n1",
                            line=f"{TOLERABLE_REACTOR_STALL} ms",
                            line_number=1))
        self.assertEqual(event1.severity, Severity.NORMAL)
        self.assertEqual(event1.node, "n1")
        self.assertEqual(event1.line, f"{TOLERABLE_REACTOR_STALL} ms")
        self.assertEqual(event1.line_number, 1)

        event2 = DatabaseLogEvent.REACTOR_STALLED()
        self.assertEqual(event2.severity, Severity.WARNING)
        self.assertIs(
            event2,
            event2.add_info(node="n2",
                            line=f"{TOLERABLE_REACTOR_STALL+1} ms",
                            line_number=2))
        self.assertEqual(event2.severity, Severity.WARNING)
        self.assertEqual(event2.node, "n2")
        self.assertEqual(event2.line, f"{TOLERABLE_REACTOR_STALL+1} ms")
        self.assertEqual(event2.line_number, 2)
 def test_eval_filter_event_class_and_regex(self):
     filter = EventsFilter(event_class=DatabaseLogEvent.BAD_ALLOC, regex=".*xyz.*")
     event1 = DatabaseLogEvent.BAD_ALLOC().add_info(node="node1", line="xyz", line_number=1)
     event2 = DatabaseLogEvent.NO_SPACE_ERROR().add_info(node="node1", line="xyz", line_number=1)
     event3 = DatabaseLogEvent.NO_SPACE_ERROR().add_info(node="node1", line="abc", line_number=1)
     self.assertTrue(filter.eval_filter(event1))
     self.assertFalse(filter.eval_filter(event2))
     self.assertFalse(filter.eval_filter(event3))
 def test_eval_filter_just_type(self):
     filter = DbEventsFilter(db_event=DatabaseLogEvent.REACTOR_STALLED)
     self.assertEqual(filter, pickle.loads(pickle.dumps(filter)))
     filter.to_json()
     event1 = DatabaseLogEvent.REACTOR_STALLED()
     event2 = DatabaseLogEvent.NO_SPACE_ERROR()
     self.assertTrue(filter.eval_filter(event1))
     self.assertFalse(filter.eval_filter(event2))
 def test_eval_filter_type_with_line_and_node(self):
     filter = DbEventsFilter(db_event=DatabaseLogEvent.BAD_ALLOC, node="node1", line="y")
     event1 = DatabaseLogEvent.BAD_ALLOC().add_info(node="node1", line="xyz", line_number=1)
     event2 = event1.clone().add_info(node="node1", line="abc", line_number=1)
     event3 = DatabaseLogEvent.NO_SPACE_ERROR().add_info(node="node1", line="xyz", line_number=1)
     self.assertTrue(filter.eval_filter(event1))
     self.assertFalse(filter.eval_filter(event2))
     self.assertFalse(filter.eval_filter(event3))
 def test_eval_filter_event_class_common_parent(self):
     filter = EventsFilter(event_class=DatabaseLogEvent)
     self.assertEqual(filter, pickle.loads(pickle.dumps(filter)))
     filter.to_json()
     event1 = DatabaseLogEvent.BAD_ALLOC()
     event2 = DatabaseLogEvent.NO_SPACE_ERROR()
     self.assertTrue(filter.eval_filter(event1))
     self.assertTrue(filter.eval_filter(event2))
 def test_eval_filter_event_class(self):
     db_events_filter = EventsFilter(event_class=DatabaseLogEvent.BAD_ALLOC)
     self.assertEqual(db_events_filter,
                      pickle.loads(pickle.dumps(db_events_filter)))
     db_events_filter.to_json()
     event1 = DatabaseLogEvent.BAD_ALLOC()
     event2 = DatabaseLogEvent.NO_SPACE_ERROR()
     self.assertTrue(db_events_filter.eval_filter(event1))
     self.assertFalse(db_events_filter.eval_filter(event2))
Ejemplo n.º 11
0
    def test_filter_by_node(self):
        with self.wait_for_n_events(self.get_events_logger(), count=4, timeout=3):
            with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR, node="A"):
                DatabaseLogEvent.NO_SPACE_ERROR().add_info(node="A", line_number=22, line="this is filtered").publish()
                DatabaseLogEvent.NO_SPACE_ERROR().add_info(node="B", line_number=22, line="not filtered").publish()

        log_content = self.get_event_log_file("events.log")

        self.assertIn("not filtered", log_content)
        self.assertNotIn("this is filtered", log_content)
    def test_kernel_callstack_severity(self):
        event1 = DatabaseLogEvent.KERNEL_CALLSTACK()
        self.assertEqual(event1.severity, Severity.DEBUG)

        self.assertIs(event1, event1.add_info(node="n1", line="kernel callstack 0xffffffffffffff80", line_number=1))
        self.assertEqual(event1.node, "n1")
        self.assertEqual(event1.line_number, 1)

        event2 = DatabaseLogEvent.REACTOR_STALLED()
        self.assertEqual(event2.severity, Severity.DEBUG)
        self.assertIs(event2, event2.add_info(node="n2", line="kernel callstack 0xffffffffffffff80", line_number=2))
        self.assertEqual(event2.node, "n2")
        self.assertEqual(event2.line_number, 2)
Ejemplo n.º 13
0
    def test_filter_upgrade(self):
        known_failure_line = "!ERR     | scylla:  [shard 3] storage_proxy - Exception when communicating with " \
                             "10.142.0.56: std::runtime_error (Failed to load schema version " \
                             "b40e405f-462c-38f2-a90c-6f130ddbf6f3) "

        with self.wait_for_n_events(self.get_events_logger(), count=5, timeout=3):
            with DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="Failed to load schema"):
                DatabaseLogEvent.RUNTIME_ERROR().add_info(node="A", line_number=22, line=known_failure_line).publish()
                DatabaseLogEvent.RUNTIME_ERROR().add_info(node="A", line_number=22, line=known_failure_line).publish()
                DatabaseLogEvent.NO_SPACE_ERROR().add_info(node="B", line_number=22, line="not filtered").publish()

        log_content = self.get_event_log_file("events.log")

        self.assertIn("not filtered", log_content)
        self.assertNotIn("Exception when communicating", log_content)
Ejemplo n.º 14
0
    def test_default_filters(self):
        with self.wait_for_n_events(self.get_events_logger(), count=2):
            DatabaseLogEvent.BACKTRACE() \
                .add_info(node="A",
                          line_number=22,
                          line="Jul 01 03:37:31 ip-10-0-127-151.eu-west-1.compute.internal"
                               " scylla[6026]:Rate-limit: supressed 4294967292 backtraces on shard 5") \
                .publish()
            DatabaseLogEvent.BACKTRACE() \
                .add_info(node="A", line_number=22, line="other back trace that shouldn't be filtered") \
                .publish()

        log_content = self.get_event_log_file("events.log")

        self.assertIn("other back trace", log_content)
        self.assertNotIn("supressed", log_content)
 def test_eval_filter(self):
     db_events_filter = EventsSeverityChangerFilter(
         new_severity=Severity.NORMAL, event_class=DatabaseLogEvent)
     event = DatabaseLogEvent.BAD_ALLOC()
     self.assertEqual(event.severity, Severity.ERROR)
     db_events_filter.eval_filter(event)
     self.assertEqual(event.severity, Severity.NORMAL)
Ejemplo n.º 16
0
    def test_failed_stall_during_filter(self):
        with self.wait_for_n_events(self.get_events_logger(), count=5, timeout=3):
            with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR), \
                    DbEventsFilter(db_event=DatabaseLogEvent.BACKTRACE, line="No space left on device"):

                event = DatabaseLogEvent.REACTOR_STALLED()
                event.add_info(node="A",
                               line_number=22,
                               line="[99.80.124.204] [stdout] Mar 31 09:08:10 warning|  reactor stall 20").publish()

        self.assertEqual(event.severity, Severity.DEBUG)
    def test_search_power_off(self):
        self.node.system_log = os.path.join(os.path.dirname(__file__),
                                            'test_data', 'power_off.log')
        with DbEventsFilter(db_event=DatabaseLogEvent.POWER_OFF,
                            node=self.node):
            self.node._read_system_log_and_publish_events(
                start_from_beginning=True)

        DatabaseLogEvent.POWER_OFF().add_info(
            node="A",
            line_number=22,
            line=
            f"{datetime.utcfromtimestamp(time.time() + 1):%Y-%m-%dT%H:%M:%S+00:00} "
            "longevity-large-collections-12h-mas-db-node-c6a4e04e-1 !INFO    | systemd-logind: Powering Off..."
        ).publish()

        time.sleep(0.1)
        with self.get_events_logger().events_logs_by_severity[
                Severity.CRITICAL].open() as events_file:
            events = [line for line in events_file if 'Powering Off' in line]
            assert events
    def test_disk_error_event(self):  # pylint: disable=line-too-long

        disk_error_event = DatabaseLogEvent.DISK_ERROR()
        # pylint: disable=line-too-long
        log_lines = """2022-02-07T06:13:14+00:00 longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4 !    INFO |  [shard 8] compaction - [Compact keyspace1.standard1 089530c0-87dd-11ec-8382-519d84e34cb0] Compacting [/var/lib/scylla/data/keyspace1/standard1-b8e41570875811ec8382519d84e34cb0/md-287128-big-Data.db:level=2:origin=compaction,/var/lib/scylla/data/keyspace1/standard1-b8e41570875811ec8382519d84e34cb0/md-287112-big-Data.db:level=2:origin=compaction,/var/lib/scylla/data/keyspace1/standard1-b8e41570875811ec8382519d84e34cb0/md-290136-big-Data.db:level=2:origin=compaction,/var/lib/scylla/data/keyspace1/standard1-b8e41570875811ec8382519d84e34cb0/md-291192-big-Data.db:level=1:origin=compaction]
2022-02-07T06:13:14+00:00 longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4 !     ERR | blk_update_request: critical medium error, dev nvme0n11, sector 4141328 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0
2022-02-07T06:13:14+00:00 longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4 !     ERR |  [shard 6] storage_service - Shutting down communications due to I/O errors until operator intervention: Disk error: std::system_error (error system:61, No data available)
2022-02-07T06:13:14+00:00 longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4 !    INFO |  [shard 0] storage_service - Stop transport: starts
2022-02-07T06:13:14+00:00 longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4 !    INFO |  [shard 0] storage_service - Shutting down native transport
        """
        expected_error_data = {
            "line_number": 2,
            "line": "2022-02-07T06:13:14+00:00 longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4 !     ERR |  [shard 6] storage_service - Shutting down communications due to I/O errors until operator intervention: Disk error: std::system_error (error system:61, No data available)",
            "node": "longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4"
        }

        for num, line in enumerate(log_lines.splitlines()):
            if re.search(disk_error_event.regex, line):
                disk_error_event.add_info("longevity-tls-1tb-7d-4-6-db-node-5279f155-0-4",
                                          line, num)

        self.assertEqual(expected_error_data["node"], disk_error_event.node)
        self.assertEqual(expected_error_data["line_number"], disk_error_event.line_number)
        self.assertEqual(expected_error_data["line"], disk_error_event.line)