def ignore_no_space_errors(node):
    """Suppress every "No space left on device" flavored event for `node`.

    Installs a NO_SPACE_ERROR filter plus the backtrace/database/filesystem
    variants of the message, then yields while the filters remain active.
    """
    with ExitStack() as stack:
        stack.enter_context(DbEventsFilter(
            db_event=DatabaseLogEvent.NO_SPACE_ERROR,
            node=node,
        ))
        # The same underlying condition surfaces under several event types.
        for no_space_event in (DatabaseLogEvent.BACKTRACE,
                               DatabaseLogEvent.DATABASE_ERROR,
                               DatabaseLogEvent.FILESYSTEM_ERROR):
            stack.enter_context(DbEventsFilter(
                db_event=no_space_event,
                line="No space left on device",
                node=node,
            ))
        yield
def ignore_upgrade_schema_errors():
    """Suppress schema-related DB errors that are expected during upgrades."""
    filter_specs = (
        (DatabaseLogEvent.DATABASE_ERROR, "Failed to load schema"),
        (DatabaseLogEvent.SCHEMA_FAILURE, "Failed to load schema"),
        (DatabaseLogEvent.DATABASE_ERROR, "Failed to pull schema"),
        (DatabaseLogEvent.RUNTIME_ERROR, "Failed to load schema"),
        # NOTE(review): the original comment here was garbled ("version
        # rating ... Drain operating system"); it appears to describe an
        # error seen only during upgrade on a specific OS -- confirm
        # against the original commit history.
        (DatabaseLogEvent.DATABASE_ERROR,
         "cql_server - exception while processing connection: seastar::nested_exception "
         "(seastar::nested_exception)"),
    )
    with ExitStack() as stack:
        for db_event, line in filter_specs:
            stack.enter_context(DbEventsFilter(db_event=db_event, line=line))
        yield
Ejemplo n.º 3
0
    def test_filter_repair(self):
        """Repair-failure events matching the active filters must not reach the log."""
        failed_repaired_line = (
            "2019-07-28T10:53:29+00:00  ip-10-0-167-91 !INFO    | scylla.bin: [shard 0] repair - "
            "Got error in row level repair: std::runtime_error (repair id 1 is aborted on shard 0)")

        # 9 events in total: 2 events per filter x 3 filters + 3 events.
        with self.wait_for_n_events(self.get_events_logger(), count=9, timeout=3):
            with DbEventsFilter(
                    db_event=DatabaseLogEvent.DATABASE_ERROR,
                    line="repair's stream failed: streaming::stream_exception"), \
                    DbEventsFilter(
                        db_event=DatabaseLogEvent.RUNTIME_ERROR,
                        line="Can not find stream_manager"), \
                    DbEventsFilter(
                        db_event=DatabaseLogEvent.RUNTIME_ERROR,
                        line="is aborted"):
                # Two filtered repair errors, then one event that must pass through.
                for _ in range(2):
                    DatabaseLogEvent.RUNTIME_ERROR().add_info(
                        node="A", line_number=22,
                        line=failed_repaired_line).publish()
                DatabaseLogEvent.NO_SPACE_ERROR().add_info(
                    node="B", line_number=22, line="not filtered").publish()

        events_log = self.get_event_log_file("events.log")
        self.assertIn("not filtered", events_log)
        self.assertNotIn("repair id 1", events_log)
Ejemplo n.º 4
0
    def test_filter(self):
        """All ENOSPC events must be swallowed while the no-space filters are active."""
        enospc_line_1 = (
            "[99.80.124.204] [stdout] Mar 31 09:08:10 warning|  [shard 8] commitlog - Exception in segment "
            "reservation: storage_io_error (Storage I/O error: 28: No space left on device)")
        enospc_line_2 = (
            "2019-10-29T12:19:49+00:00  ip-172-30-0-184 !WARNING | scylla: [shard 2] storage_service - Commitlog "
            "error: std::filesystem::__cxx11::filesystem_error (error system:28, filesystem error: open failed: No "
            "space left on device [/var/lib/scylla/hints/2/172.30.0.116/HintsLog-1-36028797019122576.log])")

        log_content_before = self.get_event_log_file("events.log")

        # 13 events in total: 2 events per filter x 4 filters + 5 events.
        with self.wait_for_n_events(self.get_events_logger(), count=13, timeout=3):
            with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR), \
                    DbEventsFilter(db_event=DatabaseLogEvent.BACKTRACE, line="No space left on device"), \
                    DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, line="No space left on device"), \
                    DbEventsFilter(db_event=DatabaseLogEvent.FILESYSTEM_ERROR, line="No space left on device"):
                # Publish in the original order; every one should be filtered out.
                for event_type, log_line in (
                        (DatabaseLogEvent.NO_SPACE_ERROR, enospc_line_1),
                        (DatabaseLogEvent.BACKTRACE, enospc_line_1),
                        (DatabaseLogEvent.FILESYSTEM_ERROR, enospc_line_2),
                        (DatabaseLogEvent.DATABASE_ERROR, enospc_line_1),
                        (DatabaseLogEvent.NO_SPACE_ERROR, enospc_line_1)):
                    event_type().add_info(node="A", line_number=22, line=log_line).publish()

        # Nothing new should have been appended to the events log.
        self.assertEqual(log_content_before,
                         self.get_event_log_file("events.log"))
Ejemplo n.º 5
0
def start_events_device(
        log_dir: Optional[Union[str, Path]] = None,
        _registry: Optional[EventsProcessesRegistry] = None) -> None:
    """Start the events main device, its subscribers and the default filters.

    Either `log_dir` or an existing `_registry` must be provided; a default
    registry is created from `log_dir` when `_registry` is None.

    Raises:
        RuntimeError: when neither `log_dir` nor `_registry` is given.
    """
    if _registry is None:
        if log_dir is None:
            raise RuntimeError(
                "Should provide log_dir or instance of EventsProcessesRegistry"
            )
        _registry = create_default_events_process_registry(log_dir=log_dir)

    start_events_main_device(_registry=_registry)
    time.sleep(EVENTS_DEVICE_START_DELAY)

    # Bring up every subscriber process, then give them time to attach.
    for start_subscriber in (start_events_logger,
                             start_grafana_pipeline,
                             start_events_analyzer):
        start_subscriber(_registry=_registry)
    time.sleep(EVENTS_SUBSCRIBERS_START_DELAY)

    # Default filters.
    EventsSeverityChangerFilter(
        new_severity=Severity.WARNING,
        event_class=DatabaseLogEvent.DATABASE_ERROR,
        regex=r'.*workload prioritization - update_service_levels_from_distributed_data: an '
              r'error occurred while retrieving configuration').publish()
    # Both the historical misspelling and the fixed spelling of the message.
    for rate_limit_line in ('Rate-limit: supressed', 'Rate-limit: suppressed'):
        DbEventsFilter(db_event=DatabaseLogEvent.BACKTRACE,
                       line=rate_limit_line).publish()

    atexit.register(stop_events_device, _registry=_registry)
    def test_client_encryption(self):
        """Check manager health reports SSL OFF before and SSL ON/UP after enabling client encryption."""
        self.log.info('starting test_client_encryption')
        manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
        mgr_cluster = manager_tool.add_cluster(
            name=self.CLUSTER_NAME + "_encryption",
            db_cluster=self.db_cluster,
            auth_token=self.monitors.mgmt_auth_token)
        self.generate_load_and_wait_for_results()
        repair_task = mgr_cluster.create_repair_task(fail_fast=True)

        # Before enabling encryption every host must report SSL off.
        for host_health in mgr_cluster.get_hosts_health().values():
            assert host_health.ssl == HostSsl.OFF, "Not all hosts ssl is 'OFF'"

        # Known-benign errors while the cluster switches to client encryption.
        with DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, line="failed to do checksum for"), \
                DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="failed to do checksum for"), \
                DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, line="Reactor stalled"), \
                DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="get_repair_meta: repair_meta_id"):
            self.db_cluster.enable_client_encrypt()

        mgr_cluster.update(client_encrypt=True)
        repair_task.start()
        sleep = 40
        self.log.debug('Sleep {} seconds, waiting for health-check task to run by schedule on first time'.format(sleep))
        time.sleep(sleep)
        healthcheck_task = mgr_cluster.get_healthcheck_task()
        self.log.debug("Health-check task history is: {}".format(healthcheck_task.history))

        # After encryption is on, every host must report SSL on and be up.
        for host_health in mgr_cluster.get_hosts_health().values():
            assert host_health.ssl == HostSsl.ON, "Not all hosts ssl is 'ON'"
            assert host_health.status == HostStatus.UP, "Not all hosts status is 'UP'"
        self.log.info('finishing test_client_encryption')
Ejemplo n.º 7
0
def start_events_device(
        log_dir: Optional[Union[str, Path]] = None,
        _registry: Optional[EventsProcessesRegistry] = None) -> None:
    """Boot the events pipeline: main device, subscriber processes, filters.

    Raises:
        RuntimeError: when neither `log_dir` nor `_registry` is given.
    """
    if _registry is None:
        if log_dir is None:
            raise RuntimeError(
                "Should provide log_dir or instance of EventsProcessesRegistry"
            )
        _registry = create_default_events_process_registry(log_dir=log_dir)

    start_events_main_device(_registry=_registry)
    time.sleep(EVENTS_DEVICE_START_DELAY)

    start_events_logger(_registry=_registry)
    start_grafana_pipeline(_registry=_registry)
    start_events_analyzer(_registry=_registry)
    time.sleep(EVENTS_SUBSCRIBERS_START_DELAY)

    # Default filters: both spellings of the rate-limit backtrace message.
    for rate_limit_line in ('Rate-limit: supressed', 'Rate-limit: suppressed'):
        DbEventsFilter(db_event=DatabaseLogEvent.BACKTRACE,
                       line=rate_limit_line).publish()

    # Make sure everything is torn down at interpreter exit.
    atexit.register(stop_events_device, _registry=_registry)
 def test_eval_filter_type_with_line_and_node(self):
     """A filter with type+node+line matches only events of that type, node and line."""
     # Renamed local from `filter` to avoid shadowing the builtin.
     events_filter = DbEventsFilter(db_event=DatabaseLogEvent.BAD_ALLOC, node="node1", line="y")
     event1 = DatabaseLogEvent.BAD_ALLOC().add_info(node="node1", line="xyz", line_number=1)
     event2 = event1.clone().add_info(node="node1", line="abc", line_number=1)
     event3 = DatabaseLogEvent.NO_SPACE_ERROR().add_info(node="node1", line="xyz", line_number=1)
     self.assertTrue(events_filter.eval_filter(event1))   # type, node and line all match
     self.assertFalse(events_filter.eval_filter(event2))  # line does not contain "y"
     self.assertFalse(events_filter.eval_filter(event3))  # wrong event type
def ignore_scrub_invalid_errors():
    """Suppress DATABASE_ERROR events that scrub legitimately emits for invalid data."""
    with ExitStack() as stack:
        for scrub_line in ("Skipping invalid clustering row fragment",
                           "Skipping invalid partition"):
            stack.enter_context(DbEventsFilter(
                db_event=DatabaseLogEvent.DATABASE_ERROR,
                line=scrub_line,
            ))
        yield
Ejemplo n.º 10
0
    def test_failed_stall_during_filter(self):
        """A reactor-stall event published under unrelated filters keeps DEBUG severity."""
        with self.wait_for_n_events(self.get_events_logger(), count=5, timeout=3):
            with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR), \
                    DbEventsFilter(db_event=DatabaseLogEvent.BACKTRACE,
                                   line="No space left on device"):
                stall_event = DatabaseLogEvent.REACTOR_STALLED()
                stall_event.add_info(
                    node="A",
                    line_number=22,
                    line="[99.80.124.204] [stdout] Mar 31 09:08:10 warning|  reactor stall 20",
                ).publish()

        # The active filters target other event types, so the stall keeps its severity.
        self.assertEqual(stall_event.severity, Severity.DEBUG)
Ejemplo n.º 11
0
    def test_filter_expiration(self):
        """Events timestamped before filter expiration stay filtered; later ones pass."""
        def publish_no_space(log_line):
            # One NO_SPACE_ERROR event from node "A" with the given line.
            DatabaseLogEvent.NO_SPACE_ERROR().add_info(
                node="A", line_number=22, line=log_line).publish()

        with self.wait_for_n_events(self.get_events_logger(), count=5, timeout=10):
            line_prefix = f"{datetime.utcnow():%Y-%m-%dT%H:%M:%S+00:00}"

            with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR, node="A"):
                publish_no_space(line_prefix + " this is filtered")

            time.sleep(2)

            # Same (old) timestamp: still filtered even though the filter exited.
            publish_no_space(line_prefix + " this is filtered")

            time.sleep(2)

            # Fresh timestamp, past the filter expiration: must pass through.
            line_prefix = f"{datetime.utcnow():%Y-%m-%dT%H:%M:%S+00:00}"
            publish_no_space(line_prefix + " this is not filtered")

        log_content = self.get_event_log_file("events.log")
        self.assertIn("this is not filtered", log_content)
        self.assertNotIn("this is filtered", log_content)
Ejemplo n.º 12
0
    def test_filter_by_node(self):
        """A node-scoped filter drops events from the matching node only."""
        with self.wait_for_n_events(self.get_events_logger(), count=4, timeout=3):
            with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR, node="A"):
                # Same event type from two nodes; only node "A" is filtered.
                for node_name, log_line in (("A", "this is filtered"),
                                            ("B", "not filtered")):
                    DatabaseLogEvent.NO_SPACE_ERROR().add_info(
                        node=node_name, line_number=22, line=log_line).publish()

        log_content = self.get_event_log_file("events.log")
        self.assertIn("not filtered", log_content)
        self.assertNotIn("this is filtered", log_content)
Ejemplo n.º 13
0
def ignore_upgrade_schema_errors():
    """Suppress schema load/pull failures that are expected while upgrading."""
    with ExitStack() as stack:
        for db_event, error_line in (
                (DatabaseLogEvent.DATABASE_ERROR, "Failed to load schema"),
                (DatabaseLogEvent.SCHEMA_FAILURE, "Failed to load schema"),
                (DatabaseLogEvent.DATABASE_ERROR, "Failed to pull schema"),
                (DatabaseLogEvent.RUNTIME_ERROR, "Failed to load schema")):
            stack.enter_context(DbEventsFilter(db_event=db_event, line=error_line))
        yield
 def test_eval_filter_just_type(self):
     """A type-only filter matches any event of that type and survives pickling."""
     # Renamed local from `filter` to avoid shadowing the builtin.
     events_filter = DbEventsFilter(db_event=DatabaseLogEvent.REACTOR_STALLED)
     # The filter must round-trip through pickle and be JSON-serializable.
     self.assertEqual(events_filter, pickle.loads(pickle.dumps(events_filter)))
     events_filter.to_json()
     event1 = DatabaseLogEvent.REACTOR_STALLED()
     event2 = DatabaseLogEvent.NO_SPACE_ERROR()
     self.assertTrue(events_filter.eval_filter(event1))   # matching type
     self.assertFalse(events_filter.eval_filter(event2))  # different type
Ejemplo n.º 15
0
    def test_ignore_power_off(self):
        """POWER_OFF events parsed from the system log are suppressed by the filter."""
        self.node.system_log = os.path.join(os.path.dirname(__file__),
                                            'test_data', 'power_off.log')
        with DbEventsFilter(db_event=DatabaseLogEvent.POWER_OFF,
                            node=self.node):
            self.node._read_system_log_and_publish_events(
                start_from_beginning=True)

            # Give the logger a moment to flush, then verify nothing critical leaked.
            time.sleep(0.1)
            critical_log = self.get_events_logger().events_logs_by_severity[Severity.CRITICAL]
            with critical_log.open() as events_file:
                assert not [line for line in events_file if 'Powering Off' in line]
Ejemplo n.º 16
0
    def test_filter_upgrade(self):
        """Schema-load failures are filtered while unrelated events pass through."""
        known_failure_line = ("!ERR     | scylla:  [shard 3] storage_proxy - Exception when communicating with "
                              "10.142.0.56: std::runtime_error (Failed to load schema version "
                              "b40e405f-462c-38f2-a90c-6f130ddbf6f3) ")

        with self.wait_for_n_events(self.get_events_logger(), count=5, timeout=3):
            with DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="Failed to load schema"):
                # Two filtered schema failures, then one event that must pass.
                for _ in range(2):
                    DatabaseLogEvent.RUNTIME_ERROR().add_info(
                        node="A", line_number=22, line=known_failure_line).publish()
                DatabaseLogEvent.NO_SPACE_ERROR().add_info(
                    node="B", line_number=22, line="not filtered").publish()

        log_content = self.get_event_log_file("events.log")
        self.assertIn("not filtered", log_content)
        self.assertNotIn("Exception when communicating", log_content)
Ejemplo n.º 17
0
    def test_search_power_off(self):
        """A POWER_OFF event published after the filter exits reaches the warning log."""
        self.node.system_log = os.path.join(os.path.dirname(__file__),
                                            'test_data', 'power_off.log')
        with DbEventsFilter(db_event=InstanceStatusEvent.POWER_OFF,
                            node=self.node):
            self._read_and_publish_events()

        # Published outside the filter scope, so it must NOT be suppressed.
        power_off_line = (
            f"{datetime.utcfromtimestamp(time.time() + 1):%Y-%m-%dT%H:%M:%S+00:00} "
            "longevity-large-collections-12h-mas-db-node-c6a4e04e-1 !INFO    | systemd-logind: Powering Off...")
        InstanceStatusEvent.POWER_OFF().add_info(
            node="A", line_number=22, line=power_off_line).publish()

        time.sleep(0.1)
        warning_log = self.get_events_logger().events_logs_by_severity[Severity.WARNING]
        with warning_log.open() as events_file:
            assert [line for line in events_file if 'Powering Off' in line]
Ejemplo n.º 18
0
    def restart(self):
        """Restart the cloud instance backing this node (stop, then start).

        Skipped entirely for spot instances, which cannot be stopped.
        Seed nodes first get a different living node configured as their
        seed provider; nodes whose data device is lost on reboot are
        reconfigured to bootstrap as a replacement of themselves and have
        their storage recreated after the instance comes back.
        """
        # We differentiate between "Restart" and "Reboot".
        # Restart in AWS will be a Stop and Start of an instance.
        # When using storage optimized instances like i2 or i3, the data on disk is deleted upon STOP.  Therefore, we
        # need to setup the instance and treat it as a new instance.
        if self._instance.spot_instance_request_id:
            LOGGER.debug("target node is spot instance, impossible to stop this instance, skipping the restart")
            return

        if self.is_seed:
            # Due to https://github.com/scylladb/scylla/issues/7588, when we restart a node that is defined as "seed",
            # we must state a different, living node as the seed provider in the scylla yaml of the restarted node
            other_nodes = list(set(self.parent_cluster.nodes) - {self})
            # Prefer nodes not currently running a nemesis as the new seed.
            free_nodes = [node for node in other_nodes if not node.running_nemesis]
            random_node = random.choice(free_nodes)
            seed_provider = SeedProvider(
                class_name="org.apache.cassandra.locator.SimpleSeedProvider",
                parameters=[{"seeds": f"{random_node.ip_address}"}]
            )

            with self.remote_scylla_yaml() as scylla_yml:
                scylla_yml.seed_provider = [seed_provider]

        with ExitStack() as stack:
            if self.is_data_device_lost_after_reboot:
                # There is no disk yet, lots of the errors here are acceptable, and we'll ignore them.
                for db_filter in (DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, node=self),
                                  DbEventsFilter(db_event=DatabaseLogEvent.SCHEMA_FAILURE, node=self),
                                  DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR, node=self),
                                  DbEventsFilter(db_event=DatabaseLogEvent.FILESYSTEM_ERROR, node=self),
                                  DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, node=self), ):
                    stack.enter_context(db_filter)

                # Comment out the scylla data mount, force auto_bootstrap, and make the node
                # replace itself on first boot (its data is about to be lost with the STOP).
                self.remoter.sudo(shell_script_cmd(f"""\
                    sed -e '/.*scylla/s/^/#/g' -i /etc/fstab
                    sed -e '/auto_bootstrap:.*/s/false/true/g' -i /etc/scylla/scylla.yaml
                    if ! grep ^replace_address_first_boot: /etc/scylla/scylla.yaml; then
                        echo 'replace_address_first_boot: {self.ip_address}' | tee --append /etc/scylla/scylla.yaml
                    fi
                """))

            # Full stop/start cycle; _instance_wait_safe presumably retries around
            # transient cloud API errors -- confirm against its implementation.
            self._instance.stop()
            self._instance_wait_safe(self._instance.wait_until_stopped)
            self._instance.start()
            self._instance_wait_safe(self._instance.wait_until_running)
            self._wait_public_ip()

            self.log.debug("Got a new public IP: %s", self._instance.public_ip_address)

            self.refresh_ip_address()
            self.wait_ssh_up()

            if self.is_data_device_lost_after_reboot:
                self.stop_scylla_server(verify_down=False)

                # Moving var-lib-scylla.mount away, since scylla_create_devices fails if it already exists
                mount_path = "/etc/systemd/system/var-lib-scylla.mount"
                if self.remoter.sudo(f"test -e {mount_path}", ignore_status=True).ok:
                    self.remoter.sudo(f"mv {mount_path} /tmp/")

                # The scylla_create_devices has been moved to the '/opt/scylladb' folder in the master branch.
                # Probe the known historical locations and run the first executable one.
                for create_devices_file in ("/usr/lib/scylla/scylla-ami/scylla_create_devices",
                                            "/opt/scylladb/scylla-ami/scylla_create_devices",
                                            "/opt/scylladb/scylla-machine-image/scylla_create_devices", ):
                    if self.remoter.sudo(f"test -x {create_devices_file}", ignore_status=True).ok:
                        self.remoter.sudo(create_devices_file)
                        break
                else:
                    # for/else: no known location had an executable script.
                    raise IOError("scylla_create_devices file isn't found")

                self.start_scylla_server(verify_up=False)

                # Undo the temporary bootstrap/replace settings now that the node is back.
                self.remoter.sudo(shell_script_cmd("""\
                    sed -e '/auto_bootstrap:.*/s/true/false/g' -i /etc/scylla/scylla.yaml
                    sed -e 's/^replace_address_first_boot:/# replace_address_first_boot:/g' -i /etc/scylla/scylla.yaml
                """))