def test_backup_rate_limit(self):
    self.log.info('starting test_backup_rate_limit')
    if not self.is_cred_file_configured:
        self.update_config_file()
    location_list = [f's3:{self.bucket_name}']
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    mgr_cluster = manager_tool.add_cluster(name=self.CLUSTER_NAME + '_rate_limit', db_cluster=self.db_cluster,
                                           auth_token=self.monitors.mgmt_auth_token)
    self.generate_load_and_wait_for_results()
    rate_limit = ','.join([f'{dc}:{randint(1, 10)}' for dc in self.get_all_dcs_names()])
    self.log.info(f'rate limit will be {rate_limit}')
    backup_task = mgr_cluster.create_backup_task({'location': location_list, 'rate-limit': rate_limit})
    task_status = backup_task.wait_and_get_final_status()
    self.log.info(f'backup task finished with status {task_status}')
    # TODO: verify that the rate limit is as set in the cmd
    self.verify_backup_success(mgr_cluster=mgr_cluster, backup_task=backup_task)
    self.log.info('finishing test_backup_rate_limit')
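
# A minimal sketch for the TODO above, assuming verification can only be done against the
# 'dc:limit' string that was passed on the command line (checking the effective upload
# throughput would require agent/Prometheus metrics, which is not shown here).
# _parse_rate_limit is a hypothetical helper, not part of the existing test suite.
def _parse_rate_limit(rate_limit_arg):
    """Parse a 'dc1:5,dc2:3' style rate-limit argument into a {dc: MiB/s} dict."""
    parsed = {}
    for entry in rate_limit_arg.split(','):
        dc_name, limit = entry.rsplit(':', 1)
        parsed[dc_name] = int(limit)
    return parsed

# e.g. inside test_backup_rate_limit, after the task has finished:
#     assert all(1 <= limit <= 10 for limit in _parse_rate_limit(rate_limit).values())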
def test_mgmt_cluster_healthcheck(self):
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    selected_host_ip = self._get_cluster_hosts_ip()[0]
    cluster_name = 'mgr_cluster1'
    mgr_cluster = manager_tool.get_cluster(cluster_name=cluster_name) or \
        manager_tool.add_cluster(name=cluster_name, db_cluster=self.db_cluster)
    other_host, other_host_ip = [host_data for host_data in self._get_cluster_hosts_with_ips()
                                 if host_data[1] != selected_host_ip][0]
    sleep = 40
    self.log.debug('Sleep {} seconds, waiting for the health-check task to run by schedule for the first time'.format(sleep))
    time.sleep(sleep)
    healthcheck_task = mgr_cluster.get_healthcheck_task()
    self.log.debug("Health-check task history is: {}".format(healthcheck_task.history))
    dict_host_health = mgr_cluster.get_hosts_health()
    for host_health in dict_host_health.values():
        assert host_health.status == HostStatus.UP, "Not all hosts' status is 'UP'"
        assert host_health.rest_status == HostRestStatus.UP, "Not all hosts' REST status is 'UP'"
    # Check for sctool status change after scylla-server goes down
    other_host.stop_scylla_server()
    self.log.debug("Health-check next run is: {}".format(healthcheck_task.next_run))
    self.log.debug('Sleep {} seconds, waiting for the health-check task to run after the node went down'.format(sleep))
    time.sleep(sleep)
    dict_host_health = mgr_cluster.get_hosts_health()
    assert dict_host_health[other_host_ip].status == HostStatus.DOWN, \
        "Host: {} status is not 'DOWN'".format(other_host_ip)
    assert dict_host_health[other_host_ip].rest_status == HostRestStatus.DOWN, \
        "Host: {} REST status is not 'DOWN'".format(other_host_ip)
    other_host.start_scylla_server()
def test_repair_multiple_keyspace_types(self):  # pylint: disable=invalid-name
    self.log.info('starting test_repair_multiple_keyspace_types')
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    hosts = self.get_cluster_hosts_ip()
    selected_host = hosts[0]
    mgr_cluster = manager_tool.get_cluster(cluster_name=self.CLUSTER_NAME) \
        or manager_tool.add_cluster(name=self.CLUSTER_NAME, host=selected_host,
                                    auth_token=self.monitors.mgmt_auth_token)
    self._create_keyspace_and_basic_table(self.SIMPLESTRATEGY_KEYSPACE_NAME, "SimpleStrategy")
    self._create_keyspace_and_basic_table(self.LOCALSTRATEGY_KEYSPACE_NAME, "LocalStrategy")

    repair_task = mgr_cluster.create_repair_task()
    task_final_status = repair_task.wait_and_get_final_status(timeout=7200)
    assert task_final_status == TaskStatus.DONE, \
        'Task: {} final status is: {}.'.format(repair_task.id, str(repair_task.status))
    self.log.info('Task: {} is done.'.format(repair_task.id))
    self.log.debug("sctool version is: {}".format(manager_tool.version))

    expected_keyspaces_to_be_repaired = ["system_auth", "system_distributed", "system_traces",  # pylint: disable=invalid-name
                                         self.SIMPLESTRATEGY_KEYSPACE_NAME]
    repair_progress_table = repair_task.detailed_progress
    self.log.info("Looking in the repair output for all of the required keyspaces")
    for keyspace_name in expected_keyspaces_to_be_repaired:
        keyspace_repair_percentage = self._keyspace_value_in_progress_table(
            repair_task, repair_progress_table, keyspace_name)
        assert keyspace_repair_percentage is not None, \
            "The keyspace {} was not included in the repair!".format(keyspace_name)
        assert keyspace_repair_percentage == 100, \
            "The repair of the keyspace {} stopped at {}%".format(keyspace_name, keyspace_repair_percentage)

    localstrategy_keyspace_percentage = self._keyspace_value_in_progress_table(  # pylint: disable=invalid-name
        repair_task, repair_progress_table, self.LOCALSTRATEGY_KEYSPACE_NAME)
    assert localstrategy_keyspace_percentage is None, \
        "The keyspace with the LocalStrategy replication strategy was included in the repair, even though it shouldn't have been"
    self.log.info("The sctool repair command was completed successfully")
    self.log.info('finishing test_repair_multiple_keyspace_types')
def test_basic_backup(self):
    self.log.info('starting test_basic_backup')
    if not self.is_cred_file_configured:
        self.update_config_file()
    location_list = [f's3:{self.bucket_name}']
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    mgr_cluster = manager_tool.add_cluster(name=self.CLUSTER_NAME + '_basic', db_cluster=self.db_cluster,
                                           auth_token=self.monitors.mgmt_auth_token)
    self.generate_load_and_wait_for_results()
    backup_task = mgr_cluster.create_backup_task(location_list=location_list)
    backup_task.wait_for_status(list_status=[TaskStatus.DONE])
    self.verify_backup_success(mgr_cluster=mgr_cluster, backup_task=backup_task)
    self.log.info('finishing test_basic_backup')
def test_client_encryption(self):
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    mgr_cluster = manager_tool.add_cluster(name=self.CLUSTER_NAME + "_encryption", db_cluster=self.db_cluster)
    self._generate_load()
    repair_task = mgr_cluster.create_repair_task()
    self.db_cluster.enable_client_encrypt()
    mgr_cluster.update(client_encrypt=True)
    repair_task.start(use_continue=True)
    sleep = 40
    self.log.debug('Sleep {} seconds, waiting for the health-check task to run by schedule for the first time'.format(sleep))
    time.sleep(sleep)
    healthcheck_task = mgr_cluster.get_healthcheck_task()
    self.log.debug("Health-check task history is: {}".format(healthcheck_task.history))
    dict_host_health = mgr_cluster.get_hosts_health()
    for host_health in dict_host_health.values():
        assert host_health.ssl == HostSsl.ON, "Not all hosts' SSL is 'ON'"
        assert host_health.status == HostStatus.UP, "Not all hosts' status is 'UP'"
def test_mgmt_cluster_crud(self):
    """
    Test steps:
    1) add a cluster to manager.
    2) update the cluster attributes in manager: name/host/ssh-user
    3) delete the cluster from manager and re-add it again.
    """
    self.log.info('starting test_mgmt_cluster_crud')
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    hosts = self.get_cluster_hosts_ip()
    selected_host = hosts[0]
    cluster_name = 'mgr_cluster_crud'
    mgr_cluster = manager_tool.add_cluster(name=cluster_name, host=selected_host,
                                           auth_token=self.monitors.mgmt_auth_token)
    # Test cluster attributes
    cluster_orig_name = mgr_cluster.name
    mgr_cluster.update(name="{}_renamed".format(cluster_orig_name))
    assert mgr_cluster.name == cluster_orig_name + "_renamed", "Cluster name wasn't changed after update command"
    mgr_cluster.delete()
    manager_tool.add_cluster(name=cluster_name, host=selected_host, auth_token=self.monitors.mgmt_auth_token)
    self.log.info('finishing test_mgmt_cluster_crud')
def test_enospc_during_backup(self):
    self.log.info('starting test_enospc_during_backup')
    if not self.is_cred_file_configured:
        self.update_config_file()
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    hosts = self.get_cluster_hosts_ip()
    location_list = [self.bucket_name, ]
    selected_host = hosts[0]
    mgr_cluster = manager_tool.get_cluster(cluster_name=self.CLUSTER_NAME) \
        or manager_tool.add_cluster(name=self.CLUSTER_NAME, host=selected_host,
                                    auth_token=self.monitors.mgmt_auth_token)
    target_node = self.db_cluster.nodes[1]

    self.generate_load_and_wait_for_results()

    has_enospc_been_reached = False
    with ignore_no_space_errors(node=target_node):
        try:
            backup_task = mgr_cluster.create_backup_task(location_list=location_list)
            backup_task.wait_for_uploading_stage()
            backup_task.stop()

            reach_enospc_on_node(target_node=target_node)
            has_enospc_been_reached = True

            backup_task.start()
            backup_task.wait_and_get_final_status()
            assert backup_task.status == TaskStatus.DONE, \
                "The backup failed to run on a node with no free space, " \
                "while it should have had the room for snapshots due to the previous run"
        finally:
            if has_enospc_been_reached:
                clean_enospc_on_node(target_node=target_node, sleep_time=30)
def test_backup_multiple_ks_tables(self):
    self.log.info('starting test_backup_multiple_ks_tables')
    if not self.is_cred_file_configured:
        self.update_config_file()
    location_list = [f's3:{self.bucket_name}']
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    mgr_cluster = manager_tool.add_cluster(name=self.CLUSTER_NAME + '_multiple-ks', db_cluster=self.db_cluster,
                                           auth_token=self.monitors.mgmt_auth_token)
    tables = self.create_ks_and_tables(10, 100)
    self.generate_load_and_wait_for_results()
    self.log.debug(f'tables list = {tables}')
    # TODO: insert data to those tables
    backup_task = mgr_cluster.create_backup_task(location_list=location_list)
    backup_task.wait_for_status(list_status=[TaskStatus.DONE], timeout=10800)
    self.verify_backup_success(mgr_cluster=mgr_cluster, backup_task=backup_task)
    self.log.info('finishing test_backup_multiple_ks_tables')
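
# A minimal sketch for the "insert data" TODO above. It assumes `tables` maps keyspace
# names to lists of table names and that each table has a simple (pk int PRIMARY KEY,
# data text) schema -- both assumptions, since the schema created by create_ks_and_tables
# is not shown here. _insert_sample_rows is a hypothetical helper, not an existing method.
def _insert_sample_rows(self, tables, rows_per_table=10):
    with self.db_cluster.cql_connection_patient(self.db_cluster.nodes[0]) as session:
        for keyspace, table_names in tables.items():
            for table in table_names:
                for primary_key in range(rows_per_table):
                    session.execute(f"INSERT INTO {keyspace}.{table} (pk, data) "
                                    f"VALUES ({primary_key}, 'sample_{primary_key}')")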
def upgrade_scylla_manager(pre_upgrade_manager_version, target_upgrade_server_version, target_upgrade_agent_version,
                           manager_node, db_cluster):
    LOGGER.debug("Stopping manager server")
    if manager_node.is_docker():
        manager_node.remoter.sudo('supervisorctl stop scylla-manager')
    else:
        manager_node.remoter.sudo("systemctl stop scylla-manager")

    LOGGER.debug("Stopping manager agents")
    for node in db_cluster.nodes:
        node.remoter.sudo("systemctl stop scylla-manager-agent")

    LOGGER.debug("Upgrading manager server")
    manager_node.upgrade_mgmt(target_upgrade_server_version, start_manager_after_upgrade=False)

    LOGGER.debug("Upgrading and starting manager agents")
    for node in db_cluster.nodes:
        node.upgrade_manager_agent(target_upgrade_agent_version)

    LOGGER.debug("Starting manager server")
    if manager_node.is_docker():
        manager_node.remoter.sudo('supervisorctl start scylla-manager')
    else:
        manager_node.remoter.sudo("systemctl start scylla-manager")

    time_to_sleep = 30
    LOGGER.debug("Sleep %s seconds, waiting for the manager service to be ready to respond", time_to_sleep)
    sleep(time_to_sleep)

    LOGGER.debug("Comparing the new manager versions")
    manager_tool = get_scylla_manager_tool(manager_node=manager_node)
    new_manager_version = manager_tool.version
    assert new_manager_version != pre_upgrade_manager_version, "Manager failed to upgrade - " \
                                                               "previous and new versions are the same. Test failed!"
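
# For reference, this is how the upgrade tests below drive the helper (the argument values
# come from the tests' own params; nothing new is assumed here). Note that the server is
# stopped before the agents and restarted only after the agents have been upgraded,
# presumably so no task is scheduled while the agents are mid-upgrade:
#
#     upgrade_scylla_manager(pre_upgrade_manager_version=current_manager_version,
#                            target_upgrade_server_version=target_upgrade_server_version,
#                            target_upgrade_agent_version=target_upgrade_agent_version,
#                            manager_node=manager_node,
#                            db_cluster=self.db_cluster)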
def test_manager_upgrade(self):
    """
    Test steps:
    1) Run the repair test.
    2) Upgrade the manager to the new version given by the yaml param 'scylla_mgmt_upgrade_to_repo'
       (the 'from' version is given by 'scylla_mgmt_repo').
    """
    self.log.info('starting test_manager_upgrade')
    scylla_mgmt_upgrade_to_repo = self.params.get('scylla_mgmt_upgrade_to_repo')
    manager_node = self.monitors.nodes[0]
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=manager_node)
    selected_host = self.get_cluster_hosts_ip()[0]
    cluster_name = 'mgr_cluster1'
    mgr_cluster = manager_tool.get_cluster(cluster_name=cluster_name) or \
        manager_tool.add_cluster(name=cluster_name, host=selected_host,
                                 auth_token=self.monitors.mgmt_auth_token)
    self.log.info('Running some stress and repair before upgrade')
    self.test_mgmt_repair_nemesis()

    repair_task_list = mgr_cluster.repair_task_list

    manager_from_version = manager_tool.version
    manager_tool.upgrade(scylla_mgmt_upgrade_to_repo=scylla_mgmt_upgrade_to_repo)

    assert manager_from_version[0] != manager_tool.version[0], "Manager version was not changed after upgrade."

    # verify that all pre-upgrade repair tasks still exist
    for repair_task in repair_task_list:
        self.log.debug("{} status: {}".format(repair_task.id, repair_task.status))

    self.log.info('Running a new repair task after upgrade')
    repair_task = mgr_cluster.create_repair_task()
    self.log.debug("{} status: {}".format(repair_task.id, repair_task.status))
    self.log.info('finishing test_manager_upgrade')
def test_ssh_setup_script(self):
    self.log.info('starting test_ssh_setup_script')
    new_user = "******"
    new_user_identity_file = os.path.join(mgmt.MANAGER_IDENTITY_FILE_DIR, new_user) + ".pem"
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    selected_host_ip = self.get_cluster_hosts_ip()[0]
    res_ssh_setup, _ssh = manager_tool.scylla_mgr_ssh_setup(node_ip=selected_host_ip, single_node=True,
                                                            create_user=new_user)
    self.log.debug('res_ssh_setup: {}'.format(res_ssh_setup))
    new_user_login_message = "This account is currently not available"
    # Example of the generated command:
    # sudo ssh -i /root/.ssh/qa_user.pem -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
    #     -o UserKnownHostsFile=/dev/null -L 59164:0.0.0.0:10000 [email protected]
    new_user_login_cmd = "sudo ssh -i {} -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no " \
                         "-o UserKnownHostsFile=/dev/null -L 59164:0.0.0.0:10000 {}@{}".format(
                             new_user_identity_file, new_user, selected_host_ip)
    self.log.debug("new_user_login_cmd command is: {}".format(new_user_login_cmd))
    res_new_user_login_cmd = manager_tool.manager_node.remoter.run(new_user_login_cmd, ignore_status=True)
    self.log.debug("res_new_user_login_cmd is: {}".format(res_new_user_login_cmd))
    assert new_user_login_message in res_new_user_login_cmd.stdout, \
        "unexpected login-returned-message: {} (expected: {})".format(res_new_user_login_cmd.stdout,
                                                                      new_user_login_message)

    mgr_cluster = manager_tool.add_cluster(name=self.CLUSTER_NAME + "_ssh_setup", host=selected_host_ip,
                                           single_node=True, auth_token=self.monitors.mgmt_auth_token)
    # self.log.debug('mgr_cluster: {}'.format(mgr_cluster))
    healthcheck_task = mgr_cluster.get_healthcheck_task()
    self.log.debug("Health-check task history is: {}".format(healthcheck_task.history))
    dict_host_health = mgr_cluster.get_hosts_health()
    for host_health in dict_host_health.values():
        self.log.debug("host_health is: {}".format(host_health))
    self.log.info('finishing test_ssh_setup_script')
def test_mgmt_cluster_crud(self):
    """
    Test steps:
    1) add a cluster to manager.
    2) update the cluster attributes in manager: name/host/ssh-user
    3) delete the cluster from manager and re-add it again.
    """
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    hosts = self._get_cluster_hosts_ip()
    selected_host = hosts[0]
    cluster_name = 'mgr_cluster1'
    mgr_cluster = manager_tool.get_cluster(cluster_name=cluster_name) or \
        manager_tool.add_cluster(name=cluster_name, host=selected_host)
    # Test cluster attributes
    cluster_orig_name = mgr_cluster.name
    mgr_cluster.update(name="{}_renamed".format(cluster_orig_name))
    assert mgr_cluster.name == cluster_orig_name + "_renamed", "Cluster name wasn't changed after update command"
    origin_ssh_user = mgr_cluster.ssh_user
    origin_rsa_id = self.MANAGER_IDENTITY_FILE
    new_ssh_user = "******"
    new_rsa_id = '/tmp/scylla-test'
    mgr_cluster.update(ssh_user=new_ssh_user, ssh_identity_file=new_rsa_id)
    assert mgr_cluster.ssh_user == new_ssh_user, "Cluster ssh-user wasn't changed after update command"
    mgr_cluster.update(ssh_user=origin_ssh_user, ssh_identity_file=origin_rsa_id)
    mgr_cluster.delete()
    mgr_cluster2 = manager_tool.add_cluster(name=cluster_name, host=selected_host)
def test_client_encryption(self):
    self.log.info('starting test_client_encryption')
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    mgr_cluster = manager_tool.add_cluster(name=self.CLUSTER_NAME + "_encryption", db_cluster=self.db_cluster,
                                           auth_token=self.monitors.mgmt_auth_token)
    self.generate_load_and_wait_for_results()
    repair_task = mgr_cluster.create_repair_task(fail_fast=True)
    dict_host_health = mgr_cluster.get_hosts_health()
    for host_health in dict_host_health.values():
        assert host_health.ssl == HostSsl.OFF, "Not all hosts' SSL is 'OFF'"
    with DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, line="failed to do checksum for"), \
            DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="failed to do checksum for"), \
            DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, line="Reactor stalled"), \
            DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="get_repair_meta: repair_meta_id"):
        self.db_cluster.enable_client_encrypt()
        mgr_cluster.update(client_encrypt=True)
        repair_task.start()
        sleep = 40
        self.log.debug('Sleep {} seconds, waiting for the health-check task to run by schedule for the first time'.format(sleep))
        time.sleep(sleep)
        healthcheck_task = mgr_cluster.get_healthcheck_task()
        self.log.debug("Health-check task history is: {}".format(healthcheck_task.history))
        dict_host_health = mgr_cluster.get_hosts_health()
        for host_health in dict_host_health.values():
            assert host_health.ssl == HostSsl.ON, "Not all hosts' SSL is 'ON'"
            assert host_health.status == HostStatus.UP, "Not all hosts' status is 'UP'"
    self.log.info('finishing test_client_encryption')
def test_upgrade(self):  # pylint: disable=too-many-locals,too-many-statements
    target_upgrade_server_version = self.params.get('target_scylla_mgmt_server_repo')
    target_upgrade_agent_version = self.params.get('target_scylla_mgmt_agent_repo')
    manager_node = self.monitors.nodes[0]

    new_manager_http_port = 12345
    with manager_node.remote_manager_yaml() as scylla_manager_yaml:
        node_ip = scylla_manager_yaml["http"].split(":", maxsplit=1)[0]
        scylla_manager_yaml["http"] = f"{node_ip}:{new_manager_http_port}"
        scylla_manager_yaml["prometheus"] = f"{node_ip}:{self.params['manager_prometheus_port']}"
        LOGGER.info("The new Scylla Manager yaml is:\n{}".format(scylla_manager_yaml))
    manager_node.remoter.sudo("systemctl restart scylla-manager")
    manager_node.wait_manager_server_up(port=new_manager_http_port)

    manager_tool = get_scylla_manager_tool(manager_node=manager_node)
    manager_tool.add_cluster(name="cluster_under_test", db_cluster=self.db_cluster,
                             auth_token=self.monitors.mgmt_auth_token)
    current_manager_version = manager_tool.version

    LOGGER.debug("Generating load")
    self.generate_load_and_wait_for_results()

    mgr_cluster = manager_tool.get_cluster(cluster_name="cluster_under_test")

    with self.subTest("Creating recurring backup and repair tasks"):
        repair_task = mgr_cluster.create_repair_task(interval="1d")
        repair_task_current_details = wait_until_task_finishes_return_details(repair_task)

        backup_task = mgr_cluster.create_backup_task(interval="1d", location_list=self.locations,
                                                     keyspace_list=["keyspace1"])
        backup_task_current_details = wait_until_task_finishes_return_details(backup_task)
        backup_task_snapshot = backup_task.get_snapshot_tag()
        pre_upgrade_backup_task_files = mgr_cluster.get_backup_files_dict(backup_task_snapshot)

    with self.subTest("Creating a simple backup with the intention of purging it"):
        self.create_simple_table(table_name="cf1")
        self.write_multiple_rows(table_name="cf1", key_range=(1, 11))
        self.create_simple_table(table_name="cf2")
        self.write_multiple_rows(table_name="cf2", key_range=(1, 11))

        rerunning_backup_task = mgr_cluster.create_backup_task(location_list=self.locations, keyspace_list=["ks1"],
                                                               retention=2)
        rerunning_backup_task.wait_and_get_final_status(timeout=300, step=20)
        assert rerunning_backup_task.status == TaskStatus.DONE, \
            f"Unknown failure in task {rerunning_backup_task.id}"

    with self.subTest("Creating a backup task and stopping it"):
        legacy_args = "--force" if manager_tool.client_version.startswith("2.1") else None
        pausable_backup_task = mgr_cluster.create_backup_task(interval="1d", location_list=self.locations,
                                                              keyspace_list=["system_*"], legacy_args=legacy_args)
        pausable_backup_task.wait_for_status(list_status=[TaskStatus.RUNNING], timeout=180, step=2)
        pausable_backup_task.stop()

    upgrade_scylla_manager(pre_upgrade_manager_version=current_manager_version,
                           target_upgrade_server_version=target_upgrade_server_version,
                           target_upgrade_agent_version=target_upgrade_agent_version,
                           manager_node=manager_node,
                           db_cluster=self.db_cluster)

    LOGGER.debug("Checking that the previously created tasks' details have not changed")
    manager_tool = get_scylla_manager_tool(manager_node=manager_node)
    # make sure that the cluster is still added to the manager
    manager_tool.get_cluster(cluster_name="cluster_under_test")
    validate_previous_task_details(task=repair_task, previous_task_details=repair_task_current_details)
    validate_previous_task_details(task=backup_task, previous_task_details=backup_task_current_details)

    with self.subTest("Continuing an older version stopped backup task with a newer version manager"):
        pausable_backup_task.start()
        pausable_backup_task.wait_and_get_final_status(timeout=1200, step=20)
        assert pausable_backup_task.status == TaskStatus.DONE, \
            f"task {pausable_backup_task.id} failed to continue after manager upgrade"

    with self.subTest("Restoring an older version backup task with a newer version manager"):
        self.verify_backup_success(mgr_cluster=mgr_cluster, backup_task=backup_task)

    with self.subTest("Executing the 'backup list' and 'backup files' commands on an older version backup"
                      " with a newer version manager"):
        current_backup_files = mgr_cluster.get_backup_files_dict(backup_task_snapshot)
        assert pre_upgrade_backup_task_files == current_backup_files, \
            f"Backup task of the task {backup_task.id} is not identical after the manager upgrade:" \
            f"\nbefore the upgrade:\n{pre_upgrade_backup_task_files}\nafter the upgrade:\n{current_backup_files}"
        mgr_cluster.sctool.run(cmd=f" backup list -c {mgr_cluster.id}", is_verify_errorless_result=True)

    with self.subTest("Purging an older version backup"):
        # Dropping one table
        with self.db_cluster.cql_connection_patient(self.db_cluster.nodes[0]) as session:
            session.execute("DROP TABLE ks1.cf1 ;")

        for i in range(2, 4):
            LOGGER.debug("rerunning the backup task for the %s time", i)
            rerunning_backup_task.start(continue_task=False)
            rerunning_backup_task.wait_and_get_final_status(step=5)
            assert rerunning_backup_task.status == TaskStatus.DONE, \
                f"backup {rerunning_backup_task.id} that was rerun again from the start has failed to reach " \
                f"status DONE within the expected time limit"
        per_node_backup_file_paths = mgr_cluster.get_backup_files_dict(
            snapshot_tag=rerunning_backup_task.get_snapshot_tag())
        for node in self.db_cluster.nodes:
            node_id = node.host_id
            # making sure that the files of the dropped table are no longer in s3
            assert "cf1" not in per_node_backup_file_paths[node_id]["ks1"], \
                "The dropped table is still in s3, even though it should have been purged"
def test_upgrade(self):  # pylint: disable=too-many-locals,too-many-statements
    target_upgrade_server_version = self.params.get('target_scylla_mgmt_server_repo')
    target_upgrade_agent_version = self.params.get('target_scylla_mgmt_agent_repo')
    manager_node = self.monitors.nodes[0]
    manager_tool = get_scylla_manager_tool(manager_node=manager_node)
    manager_tool.add_cluster(name="cluster_under_test", db_cluster=self.db_cluster,
                             auth_token=self.monitors.mgmt_auth_token)
    current_manager_version = manager_tool.version

    LOGGER.debug("Generating load")
    self.generate_load_and_wait_for_results()

    mgr_cluster = manager_tool.get_cluster(cluster_name="cluster_under_test")

    with self.subTest("Creating recurring backup and repair tasks"):
        repair_task = mgr_cluster.create_repair_task(interval="1d")
        repair_task_current_details = wait_until_task_finishes_return_details(repair_task)

        self.update_all_agent_config_files()
        bucket_name = self.params.get('backup_bucket_location').split()[0]
        location_list = [f's3:{bucket_name}']
        backup_task = mgr_cluster.create_backup_task(interval="1d", location_list=location_list,
                                                     keyspace_list=["keyspace1"])
        backup_task_current_details = wait_until_task_finishes_return_details(backup_task)
        backup_task_snapshot = backup_task.get_snapshot_tag()
        pre_upgrade_backup_task_files = mgr_cluster.get_backup_files_dict(backup_task_snapshot)

    with self.subTest("Creating a backup task and stopping it"):
        pausable_backup_task = mgr_cluster.create_backup_task(interval="1d", location_list=location_list,
                                                              keyspace_list=["system_*"])
        pausable_backup_task.wait_for_status(list_status=[TaskStatus.RUNNING], timeout=180, step=2)
        pausable_backup_task.stop()

    with self.subTest("Creating a simple backup with the intention of purging it"):
        self.create_simple_table(table_name="cf1")
        self.write_multiple_rows(table_name="cf1", key_range=(1, 11))
        self.create_simple_table(table_name="cf2")
        self.write_multiple_rows(table_name="cf2", key_range=(1, 11))

        bucket_name = self.params.get('backup_bucket_location').split()[0]
        location_list = [f's3:{bucket_name}']
        rerunning_backup_task = mgr_cluster.create_backup_task(location_list=location_list, keyspace_list=["ks1"],
                                                               retention=2)
        rerunning_backup_task.wait_and_get_final_status(timeout=300, step=20)
        assert rerunning_backup_task.status == TaskStatus.DONE, \
            f"Unknown failure in task {rerunning_backup_task.id}"

    upgrade_scylla_manager(pre_upgrade_manager_version=current_manager_version,
                           target_upgrade_server_version=target_upgrade_server_version,
                           target_upgrade_agent_version=target_upgrade_agent_version,
                           manager_node=manager_node,
                           db_cluster=self.db_cluster)

    LOGGER.debug("Checking that the previously created tasks' details have not changed")
    manager_tool = get_scylla_manager_tool(manager_node=manager_node)
    # make sure that the cluster is still added to the manager
    manager_tool.get_cluster(cluster_name="cluster_under_test")
    validate_previous_task_details(task=repair_task, previous_task_details=repair_task_current_details)
    validate_previous_task_details(task=backup_task, previous_task_details=backup_task_current_details)

    with self.subTest("Restoring a 2.0 backup task with 2.1 manager"):
        self.verify_backup_success(mgr_cluster=mgr_cluster, backup_task=backup_task)

    with self.subTest("Continuing a 2.0 stopped backup task with 2.1 manager"):
        pausable_backup_task.start()
        pausable_backup_task.wait_and_get_final_status(timeout=1200, step=20)
        assert pausable_backup_task.status == TaskStatus.DONE, \
            f"task {pausable_backup_task.id} failed to continue after manager upgrade"

    with self.subTest("Executing the 'backup list' and 'backup files' commands on a 2.0 backup with 2.1 manager"):
        current_backup_files = mgr_cluster.get_backup_files_dict(backup_task_snapshot)
        assert pre_upgrade_backup_task_files == current_backup_files, \
            f"Backup task of the task {backup_task.id} is not identical after the manager upgrade:" \
            f"\nbefore the upgrade:\n{pre_upgrade_backup_task_files}\nafter the upgrade:\n{current_backup_files}"
        mgr_cluster.sctool.run(cmd=f" backup list -c {mgr_cluster.id}", is_verify_errorless_result=True)

    with self.subTest("Purging a 2.0 backup"):
        # Dropping one table
        with self.db_cluster.cql_connection_patient(self.db_cluster.nodes[0]) as session:
            session.execute("DROP TABLE ks1.cf1 ;")

        for i in range(2, 4):
            LOGGER.debug("rerunning the backup task for the %s time", i)
            rerunning_backup_task.start(continue_task=False)
        per_node_backup_file_paths = mgr_cluster.get_backup_files_dict(
            snapshot_tag=rerunning_backup_task.get_snapshot_tag())
        for node in self.db_cluster.nodes:
            nodetool_info = self.db_cluster.get_nodetool_info(node)
            node_id = nodetool_info['ID']
            # making sure that the files of the dropped table are no longer in s3
            assert "cf1" not in per_node_backup_file_paths[node_id]["ks1"], \
                "The dropped table is still in s3, even though it should have been purged"
def test_intensity_and_parallel(self, fault_multiple_nodes):
    keyspace_to_be_repaired = "keyspace2"
    InfoEvent(message='starting test_intensity_and_parallel').publish()
    if not self.is_cred_file_configured:
        self.update_config_file()
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    mgr_cluster = manager_tool.add_cluster(
        name=self.CLUSTER_NAME + '_intensity_and_parallel',
        db_cluster=self.db_cluster,
        auth_token=self.monitors.mgmt_auth_token,
    )

    InfoEvent(message="Starting faulty load (to be repaired)").publish()
    self.create_missing_rows_in_cluster(create_missing_rows_in_multiple_nodes=fault_multiple_nodes,
                                        keyspace_to_be_repaired=keyspace_to_be_repaired)

    InfoEvent(message="Starting a repair with no intensity").publish()
    base_repair_task = mgr_cluster.create_repair_task(keyspace="keyspace*")
    base_repair_task.wait_and_get_final_status(step=30)
    assert base_repair_task.status == TaskStatus.DONE, "The base repair task did not end in the expected time"
    InfoEvent(message=f"The base repair, with no intensity argument, took {base_repair_task.duration}").publish()

    with self.db_cluster.cql_connection_patient(self.db_cluster.nodes[0]) as session:
        session.execute(f"DROP KEYSPACE IF EXISTS {keyspace_to_be_repaired}")

    arg_list = [{"intensity": .5},
                {"intensity": .25},
                {"intensity": .0001},
                {"intensity": 2},
                {"intensity": 4},
                {"parallel": 1},
                {"parallel": 2},
                {"intensity": 2, "parallel": 1},
                {"intensity": 100},
                {"intensity": 0}]

    for arg_dict in arg_list:
        InfoEvent(message="Starting faulty load (to be repaired)").publish()
        self.create_missing_rows_in_cluster(create_missing_rows_in_multiple_nodes=fault_multiple_nodes,
                                            keyspace_to_be_repaired=keyspace_to_be_repaired)

        InfoEvent(message=f"Starting a repair with {arg_dict}").publish()
        repair_task = mgr_cluster.create_repair_task(**arg_dict, keyspace="keyspace*")
        repair_task.wait_and_get_final_status(step=30)
        InfoEvent(message=f"repair with {arg_dict} took {repair_task.duration}").publish()

        with self.db_cluster.cql_connection_patient(self.db_cluster.nodes[0]) as session:
            session.execute(f"DROP KEYSPACE IF EXISTS {keyspace_to_be_repaired}")
    InfoEvent(message='finishing test_intensity_and_parallel').publish()