def ignore_no_space_errors(node):
    """Suppress DB log events caused by running out of disk space on ``node``."""
    out_of_space = "No space left on device"
    with ExitStack() as stack:
        # NO_SPACE_ERROR is filtered for the node regardless of the log line.
        stack.enter_context(DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR, node=node))
        # The same condition also surfaces as these event types with an ENOSPC message.
        for event_type in (DatabaseLogEvent.BACKTRACE,
                           DatabaseLogEvent.DATABASE_ERROR,
                           DatabaseLogEvent.FILESYSTEM_ERROR):
            stack.enter_context(DbEventsFilter(db_event=event_type, line=out_of_space, node=node))
        yield
def ignore_upgrade_schema_errors():
    """Suppress schema load/pull errors that are expected while a cluster is upgrading."""
    expected_errors = (
        (DatabaseLogEvent.DATABASE_ERROR, "Failed to load schema"),
        (DatabaseLogEvent.SCHEMA_FAILURE, "Failed to load schema"),
        (DatabaseLogEvent.DATABASE_ERROR, "Failed to pull schema"),
        (DatabaseLogEvent.RUNTIME_ERROR, "Failed to load schema"),
        # NOTE(review): the original comment said this one appears only during
        # upgrades on a specific OS image -- confirm which versions still emit it.
        (DatabaseLogEvent.DATABASE_ERROR,
         "cql_server - exception while processing connection: seastar::nested_exception "
         "(seastar::nested_exception)"),
    )
    with ExitStack() as stack:
        for event_type, line in expected_errors:
            stack.enter_context(DbEventsFilter(db_event=event_type, line=line))
        yield
def test_filter_repair(self):
    """Repair-abort errors covered by a filter must not reach the events log."""
    aborted_repair_line = (
        "2019-07-28T10:53:29+00:00 ip-10-0-167-91 !INFO | scylla.bin: [shard 0] repair - "
        "Got error in row level repair: std::runtime_error (repair id 1 is aborted on shard 0)"
    )
    # 9 events in total: 2 events per filter x 3 filters + 3 events.
    with self.wait_for_n_events(self.get_events_logger(), count=9, timeout=3):
        with DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR,
                            line="repair's stream failed: streaming::stream_exception"), \
                DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="Can not find stream_manager"), \
                DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="is aborted"):
            # Two identical filtered events plus one unrelated event that must pass.
            for _ in range(2):
                DatabaseLogEvent.RUNTIME_ERROR().add_info(
                    node="A", line_number=22, line=aborted_repair_line).publish()
            DatabaseLogEvent.NO_SPACE_ERROR().add_info(
                node="B", line_number=22, line="not filtered").publish()

    log_content = self.get_event_log_file("events.log")
    self.assertIn("not filtered", log_content)
    self.assertNotIn("repair id 1", log_content)
def test_filter(self):
    """With ENOSPC filters active, none of the published events may hit the log."""
    enospc_commitlog_line = (
        "[99.80.124.204] [stdout] Mar 31 09:08:10 warning| [shard 8] commitlog - Exception in segment "
        "reservation: storage_io_error (Storage I/O error: 28: No space left on device)"
    )
    enospc_hints_line = (
        "2019-10-29T12:19:49+00:00 ip-172-30-0-184 !WARNING | scylla: [shard 2] storage_service - Commitlog "
        "error: std::filesystem::__cxx11::filesystem_error (error system:28, filesystem error: open failed: No "
        "space left on device [/var/lib/scylla/hints/2/172.30.0.116/HintsLog-1-36028797019122576.log])"
    )
    log_content_before = self.get_event_log_file("events.log")

    # 13 events in total: 2 events per filter x 4 filters + 5 events.
    with self.wait_for_n_events(self.get_events_logger(), count=13, timeout=3):
        with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR), \
                DbEventsFilter(db_event=DatabaseLogEvent.BACKTRACE, line="No space left on device"), \
                DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, line="No space left on device"), \
                DbEventsFilter(db_event=DatabaseLogEvent.FILESYSTEM_ERROR, line="No space left on device"):
            for event_type, log_line in ((DatabaseLogEvent.NO_SPACE_ERROR, enospc_commitlog_line),
                                         (DatabaseLogEvent.BACKTRACE, enospc_commitlog_line),
                                         (DatabaseLogEvent.FILESYSTEM_ERROR, enospc_hints_line),
                                         (DatabaseLogEvent.DATABASE_ERROR, enospc_commitlog_line),
                                         (DatabaseLogEvent.NO_SPACE_ERROR, enospc_commitlog_line)):
                event_type().add_info(node="A", line_number=22, line=log_line).publish()

    # Everything was filtered out, so the log file must be unchanged.
    self.assertEqual(log_content_before, self.get_event_log_file("events.log"))
def start_events_device(
        log_dir: Optional[Union[str, Path]] = None,
        _registry: Optional[EventsProcessesRegistry] = None) -> None:
    """Start the events main device and its subscribers, then install default filters.

    Either `log_dir` or an existing `_registry` must be provided; a
    `RuntimeError` is raised when both are missing.
    """
    if _registry is None:
        if log_dir is None:
            raise RuntimeError(
                "Should provide log_dir or instance of EventsProcessesRegistry"
            )
        _registry = create_default_events_process_registry(log_dir=log_dir)

    start_events_main_device(_registry=_registry)
    time.sleep(EVENTS_DEVICE_START_DELAY)

    for start_subscriber in (start_events_logger, start_grafana_pipeline, start_events_analyzer):
        start_subscriber(_registry=_registry)
    time.sleep(EVENTS_SUBSCRIBERS_START_DELAY)

    # Default filters.
    EventsSeverityChangerFilter(
        new_severity=Severity.WARNING,
        event_class=DatabaseLogEvent.DATABASE_ERROR,
        regex=r'.*workload prioritization - update_service_levels_from_distributed_data: an '
              r'error occurred while retrieving configuration').publish()
    # Both spellings are filtered -- presumably the message's typo was fixed at
    # some point, so logs from different versions contain either variant.
    for rate_limit_line in ('Rate-limit: supressed', 'Rate-limit: suppressed'):
        DbEventsFilter(db_event=DatabaseLogEvent.BACKTRACE, line=rate_limit_line).publish()

    atexit.register(stop_events_device, _registry=_registry)
def test_client_encryption(self):
    """Enable client encryption mid-repair and verify all hosts come up healthy with SSL on."""
    self.log.info('starting test_client_encryption')
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    mgr_cluster = manager_tool.add_cluster(name=self.CLUSTER_NAME + "_encryption",
                                           db_cluster=self.db_cluster,
                                           auth_token=self.monitors.mgmt_auth_token)
    self.generate_load_and_wait_for_results()
    repair_task = mgr_cluster.create_repair_task(fail_fast=True)

    # Before enabling encryption, every host must report SSL disabled.
    for host_health in mgr_cluster.get_hosts_health().values():
        assert host_health.ssl == HostSsl.OFF, "Not all hosts ssl is 'OFF'"

    # Checksum/repair-meta errors are expected while encryption is being switched on.
    with DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, line="failed to do checksum for"), \
            DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="failed to do checksum for"), \
            DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, line="Reactor stalled"), \
            DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="get_repair_meta: repair_meta_id"):
        self.db_cluster.enable_client_encrypt()
        mgr_cluster.update(client_encrypt=True)
        repair_task.start()

        sleep = 40
        self.log.debug('Sleep {} seconds, waiting for health-check task to run by schedule on first time'.format(sleep))
        time.sleep(sleep)

        healthcheck_task = mgr_cluster.get_healthcheck_task()
        self.log.debug("Health-check task history is: {}".format(healthcheck_task.history))

        for host_health in mgr_cluster.get_hosts_health().values():
            assert host_health.ssl == HostSsl.ON, "Not all hosts ssl is 'ON'"
            assert host_health.status == HostStatus.UP, "Not all hosts status is 'UP'"

    self.log.info('finishing test_client_encryption')
def start_events_device(
        log_dir: Optional[Union[str, Path]] = None,
        _registry: Optional[EventsProcessesRegistry] = None) -> None:
    """Start the events main device and its subscribers, then install default filters.

    Either `log_dir` or an existing `_registry` must be provided; a
    `RuntimeError` is raised when both are missing.
    """
    if _registry is None:
        if log_dir is None:
            raise RuntimeError(
                "Should provide log_dir or instance of EventsProcessesRegistry"
            )
        _registry = create_default_events_process_registry(log_dir=log_dir)

    start_events_main_device(_registry=_registry)
    time.sleep(EVENTS_DEVICE_START_DELAY)

    for start_subscriber in (start_events_logger, start_grafana_pipeline, start_events_analyzer):
        start_subscriber(_registry=_registry)
    time.sleep(EVENTS_SUBSCRIBERS_START_DELAY)

    # Default filters.  Both spellings are filtered -- presumably the message's
    # typo was fixed at some point, so logs contain either variant.
    for rate_limit_line in ('Rate-limit: supressed', 'Rate-limit: suppressed'):
        DbEventsFilter(db_event=DatabaseLogEvent.BACKTRACE, line=rate_limit_line).publish()

    atexit.register(stop_events_device, _registry=_registry)
def test_eval_filter_type_with_line_and_node(self):
    """eval_filter() matches only events with the same type, node, and line substring.

    Fix: the local was named ``filter``, shadowing the builtin -- renamed.
    """
    events_filter = DbEventsFilter(db_event=DatabaseLogEvent.BAD_ALLOC, node="node1", line="y")

    # Same type/node and "y" is a substring of the line -> matches.
    matching_event = DatabaseLogEvent.BAD_ALLOC().add_info(node="node1", line="xyz", line_number=1)
    # Same type/node but the line does not contain "y" -> no match.
    wrong_line_event = matching_event.clone().add_info(node="node1", line="abc", line_number=1)
    # Different event type -> no match even though node and line would fit.
    wrong_type_event = DatabaseLogEvent.NO_SPACE_ERROR().add_info(node="node1", line="xyz", line_number=1)

    self.assertTrue(events_filter.eval_filter(matching_event))
    self.assertFalse(events_filter.eval_filter(wrong_line_event))
    self.assertFalse(events_filter.eval_filter(wrong_type_event))
def ignore_scrub_invalid_errors():
    """Suppress "Skipping invalid ..." DB errors that scrub is expected to emit."""
    scrub_lines = (
        "Skipping invalid clustering row fragment",
        "Skipping invalid partition",
    )
    with ExitStack() as stack:
        for scrub_line in scrub_lines:
            stack.enter_context(DbEventsFilter(
                db_event=DatabaseLogEvent.DATABASE_ERROR,
                line=scrub_line,
            ))
        yield
def test_failed_stall_during_filter(self):
    """A reactor-stall event published while unrelated filters are active keeps its severity."""
    with self.wait_for_n_events(self.get_events_logger(), count=5, timeout=3):
        with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR), \
                DbEventsFilter(db_event=DatabaseLogEvent.BACKTRACE, line="No space left on device"):
            stall_event = DatabaseLogEvent.REACTOR_STALLED()
            stall_event.add_info(
                node="A",
                line_number=22,
                line="[99.80.124.204] [stdout] Mar 31 09:08:10 warning| reactor stall 20",
            ).publish()

    # The active filters target other event types, so the stall's severity is untouched.
    self.assertEqual(stall_event.severity, Severity.DEBUG)
def test_filter_expiration(self):
    """After a filter exits, events timestamped within its lifetime are still dropped."""
    with self.wait_for_n_events(self.get_events_logger(), count=5, timeout=10):
        line_prefix = f"{datetime.utcnow():%Y-%m-%dT%H:%M:%S+00:00}"

        with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR, node="A"):
            # Published while the filter is active: dropped.
            filtered_event = DatabaseLogEvent.NO_SPACE_ERROR()
            filtered_event.add_info(node="A", line_number=22, line=line_prefix + " this is filtered")
            filtered_event.publish()
            time.sleep(2)

        # The filter has exited, but this event's timestamp still falls inside
        # the filter's window, so it is dropped too.
        late_event = DatabaseLogEvent.NO_SPACE_ERROR()
        late_event.add_info(node="A", line_number=22, line=line_prefix + " this is filtered")
        late_event.publish()
        time.sleep(2)

        # A fresh timestamp after the filter expired: passes through.
        line_prefix = f"{datetime.utcnow():%Y-%m-%dT%H:%M:%S+00:00}"
        passing_event = DatabaseLogEvent.NO_SPACE_ERROR()
        passing_event.add_info(node="A", line_number=22, line=line_prefix + " this is not filtered")
        passing_event.publish()

    log_content = self.get_event_log_file("events.log")
    self.assertIn("this is not filtered", log_content)
    self.assertNotIn("this is filtered", log_content)
def test_filter_by_node(self):
    """A node-scoped filter drops events from that node only."""
    with self.wait_for_n_events(self.get_events_logger(), count=4, timeout=3):
        with DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR, node="A"):
            # Same event type on both nodes; only node "A" is covered by the filter.
            for node, log_line in (("A", "this is filtered"), ("B", "not filtered")):
                DatabaseLogEvent.NO_SPACE_ERROR().add_info(
                    node=node, line_number=22, line=log_line).publish()

    log_content = self.get_event_log_file("events.log")
    self.assertIn("not filtered", log_content)
    self.assertNotIn("this is filtered", log_content)
def ignore_upgrade_schema_errors():
    """Suppress schema load/pull errors that are expected while a cluster is upgrading."""
    expected_schema_errors = (
        (DatabaseLogEvent.DATABASE_ERROR, "Failed to load schema"),
        (DatabaseLogEvent.SCHEMA_FAILURE, "Failed to load schema"),
        (DatabaseLogEvent.DATABASE_ERROR, "Failed to pull schema"),
        (DatabaseLogEvent.RUNTIME_ERROR, "Failed to load schema"),
    )
    with ExitStack() as stack:
        for event_type, line in expected_schema_errors:
            stack.enter_context(DbEventsFilter(db_event=event_type, line=line))
        yield
def test_eval_filter_just_type(self):
    """A type-only filter matches events of that type, survives pickling, and serializes to JSON.

    Fix: the local was named ``filter``, shadowing the builtin -- renamed.
    """
    events_filter = DbEventsFilter(db_event=DatabaseLogEvent.REACTOR_STALLED)

    # Filters cross process boundaries, so they must round-trip through pickle unchanged.
    self.assertEqual(events_filter, pickle.loads(pickle.dumps(events_filter)))
    # Must not raise.
    events_filter.to_json()

    matching_event = DatabaseLogEvent.REACTOR_STALLED()
    other_type_event = DatabaseLogEvent.NO_SPACE_ERROR()
    self.assertTrue(events_filter.eval_filter(matching_event))
    self.assertFalse(events_filter.eval_filter(other_type_event))
def test_ignore_power_off(self):
    """POWER_OFF events from the node's system log are dropped while the filter is active."""
    self.node.system_log = os.path.join(os.path.dirname(__file__), 'test_data', 'power_off.log')
    with DbEventsFilter(db_event=DatabaseLogEvent.POWER_OFF, node=self.node):
        self.node._read_system_log_and_publish_events(start_from_beginning=True)
        time.sleep(0.1)
        critical_log = self.get_events_logger().events_logs_by_severity[Severity.CRITICAL]
        with critical_log.open() as events_file:
            power_off_events = [line for line in events_file if 'Powering Off' in line]
        # Nothing may leak through to the critical-severity log.
        assert power_off_events == []
def test_filter_upgrade(self):
    """Schema-load failures matched by a filter must not appear in the events log."""
    schema_load_error_line = ("!ERR | scylla: [shard 3] storage_proxy - Exception when communicating with "
                              "10.142.0.56: std::runtime_error (Failed to load schema version "
                              "b40e405f-462c-38f2-a90c-6f130ddbf6f3) ")
    with self.wait_for_n_events(self.get_events_logger(), count=5, timeout=3):
        with DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="Failed to load schema"):
            # Two identical filtered events plus one unrelated event that must pass.
            for _ in range(2):
                DatabaseLogEvent.RUNTIME_ERROR().add_info(
                    node="A", line_number=22, line=schema_load_error_line).publish()
            DatabaseLogEvent.NO_SPACE_ERROR().add_info(
                node="B", line_number=22, line="not filtered").publish()

    log_content = self.get_event_log_file("events.log")
    self.assertIn("not filtered", log_content)
    self.assertNotIn("Exception when communicating", log_content)
def test_search_power_off(self):
    """A POWER_OFF event not covered by the node filter must reach the warnings log."""
    self.node.system_log = os.path.join(os.path.dirname(__file__), 'test_data', 'power_off.log')
    with DbEventsFilter(db_event=InstanceStatusEvent.POWER_OFF, node=self.node):
        self._read_and_publish_events()
        # The filter targets self.node, while this synthetic event is for node "A",
        # so it is expected to pass through.
        InstanceStatusEvent.POWER_OFF().add_info(
            node="A",
            line_number=22,
            line=
            f"{datetime.utcfromtimestamp(time.time() + 1):%Y-%m-%dT%H:%M:%S+00:00} "
            "longevity-large-collections-12h-mas-db-node-c6a4e04e-1 !INFO | systemd-logind: Powering Off..."
        ).publish()
        time.sleep(0.1)
        warning_log = self.get_events_logger().events_logs_by_severity[Severity.WARNING]
        with warning_log.open() as events_file:
            power_off_lines = [line for line in events_file if 'Powering Off' in line]
        assert power_off_lines
def restart(self):
    """Restart the node by stopping and starting its cloud instance.

    We differentiate between "Restart" and "Reboot": Restart in AWS will be a
    Stop and Start of an instance.  When using storage optimized instances like
    i2 or i3, the data on disk is deleted upon STOP.  Therefore, we need to
    setup the instance and treat it as a new instance.
    """
    if self._instance.spot_instance_request_id:
        # Spot instances cannot be stopped, only terminated -- nothing to do here.
        LOGGER.debug("target node is spot instance, impossible to stop this instance, skipping the restart")
        return

    if self.is_seed:
        # Due to https://github.com/scylladb/scylla/issues/7588, when we restart a node that is defined as "seed",
        # we must state a different, living node as the seed provider in the scylla yaml of the restarted node
        other_nodes = list(set(self.parent_cluster.nodes) - {self})
        free_nodes = [node for node in other_nodes if not node.running_nemesis]
        random_node = random.choice(free_nodes)
        seed_provider = SeedProvider(
            class_name="org.apache.cassandra.locator.SimpleSeedProvider",
            parameters=[{"seeds": f"{random_node.ip_address}"}]
        )
        with self.remote_scylla_yaml() as scylla_yml:
            scylla_yml.seed_provider = [seed_provider]

    with ExitStack() as stack:
        if self.is_data_device_lost_after_reboot:
            # There is no disk yet, lots of the errors here are acceptable, and we'll ignore them.
            for db_filter in (DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, node=self),
                              DbEventsFilter(db_event=DatabaseLogEvent.SCHEMA_FAILURE, node=self),
                              DbEventsFilter(db_event=DatabaseLogEvent.NO_SPACE_ERROR, node=self),
                              DbEventsFilter(db_event=DatabaseLogEvent.FILESYSTEM_ERROR, node=self),
                              DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, node=self), ):
                stack.enter_context(db_filter)

            # Comment out scylla mounts in fstab (the device is gone after STOP), force
            # auto_bootstrap, and make the node replace itself on first boot.
            # NOTE(review): shell_script_cmd presumably dedents the script -- confirm.
            self.remoter.sudo(shell_script_cmd(f"""\
                sed -e '/.*scylla/s/^/#/g' -i /etc/fstab
                sed -e '/auto_bootstrap:.*/s/false/true/g' -i /etc/scylla/scylla.yaml
                if ! grep ^replace_address_first_boot: /etc/scylla/scylla.yaml; then
                    echo 'replace_address_first_boot: {self.ip_address}' | tee --append /etc/scylla/scylla.yaml
                fi
            """))

        # The stop-and-start cycle -- this is what "restart" means for a cloud instance.
        self._instance.stop()
        self._instance_wait_safe(self._instance.wait_until_stopped)
        self._instance.start()
        self._instance_wait_safe(self._instance.wait_until_running)
        self._wait_public_ip()
        self.log.debug("Got a new public IP: %s", self._instance.public_ip_address)
        self.refresh_ip_address()
        self.wait_ssh_up()

        if self.is_data_device_lost_after_reboot:
            # Data devices were wiped by the STOP: recreate them, then undo the
            # first-boot settings written above.
            self.stop_scylla_server(verify_down=False)

            # Moving var-lib-scylla.mount away, since scylla_create_devices fails if it already exists
            mount_path = "/etc/systemd/system/var-lib-scylla.mount"
            if self.remoter.sudo(f"test -e {mount_path}", ignore_status=True).ok:
                self.remoter.sudo(f"mv {mount_path} /tmp/")

            # The scylla_create_devices has been moved to the '/opt/scylladb' folder in the master branch.
            for create_devices_file in ("/usr/lib/scylla/scylla-ami/scylla_create_devices",
                                        "/opt/scylladb/scylla-ami/scylla_create_devices",
                                        "/opt/scylladb/scylla-machine-image/scylla_create_devices", ):
                if self.remoter.sudo(f"test -x {create_devices_file}", ignore_status=True).ok:
                    self.remoter.sudo(create_devices_file)
                    break
            else:
                raise IOError("scylla_create_devices file isn't found")

            self.start_scylla_server(verify_up=False)

            self.remoter.sudo(shell_script_cmd("""\
                sed -e '/auto_bootstrap:.*/s/true/false/g' -i /etc/scylla/scylla.yaml
                sed -e 's/^replace_address_first_boot:/# replace_address_first_boot:/g' -i /etc/scylla/scylla.yaml
            """))