    def test_backup_rate_limit(self):
        self.log.info('starting test_backup_rate_limit')
        if not self.is_cred_file_configured:
            self.update_config_file()
        location_list = [f's3:{self.bucket_name}']
        manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
        mgr_cluster = manager_tool.add_cluster(name=self.CLUSTER_NAME + '_rate_limit', db_cluster=self.db_cluster,
                                               auth_token=self.monitors.mgmt_auth_token)
        self.generate_load_and_wait_for_results()
        rate_limit = ','.join([f'{dc}:{randint(1, 10)}' for dc in self.get_all_dcs_names()])
        self.log.info(f'rate limit will be {rate_limit}')
        backup_task = mgr_cluster.create_backup_task({'location': location_list, 'rate-limit': rate_limit})
        task_status = backup_task.wait_and_get_final_status()
        self.log.info(f'backup task finished with status {task_status}')
        # TODO: verify that the rate limit is as set in the cmd
        self.verify_backup_success(mgr_cluster=mgr_cluster, backup_task=backup_task)
        self.log.info('finishing test_backup_rate_limit')
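The rate-limit string above is built as a comma-separated list of '<dc>:<limit>' pairs, one per datacenter. A minimal standalone sketch of building and parsing such a string (not taken from the test above; the numeric part is passed straight through to the backup task, so its unit is whatever the installed sctool expects):

from random import randint


def build_rate_limit(dc_names, low=1, high=10):
    """Return a rate-limit string such as 'dc1:3,dc2:7'."""
    return ','.join(f'{dc}:{randint(low, high)}' for dc in dc_names)


def parse_rate_limit(rate_limit):
    """Return a {dc_name: limit} dict parsed from the string above."""
    return {dc: int(value) for dc, value in
            (pair.split(':') for pair in rate_limit.split(','))}


print(parse_rate_limit(build_rate_limit(['dc1', 'dc2'])))  # e.g. {'dc1': 4, 'dc2': 9}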
Example #2
    def test_mgmt_cluster_healthcheck(self):

        manager_tool = mgmt.get_scylla_manager_tool(
            manager_node=self.monitors.nodes[0])
        selected_host_ip = self._get_cluster_hosts_ip()[0]
        cluster_name = 'mgr_cluster1'
        mgr_cluster = manager_tool.get_cluster(
            cluster_name=cluster_name) or manager_tool.add_cluster(
                name=cluster_name, db_cluster=self.db_cluster)
        other_host, other_host_ip = [
            host_data for host_data in self._get_cluster_hosts_with_ips()
            if host_data[1] != selected_host_ip
        ][0]

        sleep = 40
        self.log.debug(
            'Sleep {} seconds, waiting for the health-check task to run on schedule for the first time'
            .format(sleep))
        time.sleep(sleep)

        healthcheck_task = mgr_cluster.get_healthcheck_task()
        self.log.debug("Health-check task history is: {}".format(
            healthcheck_task.history))
        dict_host_health = mgr_cluster.get_hosts_health()
        for host_health in dict_host_health.values():
            assert host_health.status == HostStatus.UP, "Not all hosts have status 'UP'"
            assert host_health.rest_status == HostRestStatus.UP, "Not all hosts have REST status 'UP'"

        # Check for sctool status change after scylla-server down
        other_host.stop_scylla_server()
        self.log.debug("Health-check next run is: {}".format(
            healthcheck_task.next_run))
        self.log.debug(
            'Sleep {} seconds, waiting for the health-check task to run after the node went down'
            .format(sleep))
        time.sleep(sleep)

        dict_host_health = mgr_cluster.get_hosts_health()
        assert dict_host_health[other_host_ip].status == HostStatus.DOWN, \
            "Host: {} status is not 'DOWN'".format(other_host_ip)
        assert dict_host_health[other_host_ip].rest_status == HostRestStatus.DOWN, \
            "Host: {} REST status is not 'DOWN'".format(other_host_ip)

        other_host.start_scylla_server()
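The health-check test above relies on fixed 40-second sleeps. A possible polling helper, sketched here under the assumption that get_hosts_health() and HostStatus behave exactly as they do in the test, would make the wait explicit:

import time


def wait_for_host_status(mgr_cluster, host_ip, expected_status,
                         timeout=300, poll_interval=10):
    """Poll the manager until host_ip reports expected_status, or time out."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        host_health = mgr_cluster.get_hosts_health().get(host_ip)
        if host_health is not None and host_health.status == expected_status:
            return host_health
        time.sleep(poll_interval)
    raise TimeoutError(
        f"Host {host_ip} did not reach status {expected_status} within {timeout}s")

With such a helper, the second sleep in the test could become wait_for_host_status(mgr_cluster, other_host_ip, HostStatus.DOWN).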
Example #3
    def test_repair_multiple_keyspace_types(self):  # pylint: disable=invalid-name
        self.log.info('starting test_repair_multiple_keyspace_types')
        manager_tool = mgmt.get_scylla_manager_tool(
            manager_node=self.monitors.nodes[0])
        hosts = self.get_cluster_hosts_ip()
        selected_host = hosts[0]
        mgr_cluster = manager_tool.get_cluster(cluster_name=self.CLUSTER_NAME) \
            or manager_tool.add_cluster(name=self.CLUSTER_NAME, host=selected_host,
                                        auth_token=self.monitors.mgmt_auth_token)
        self._create_keyspace_and_basic_table(
            self.SIMPLESTRATEGY_KEYSPACE_NAME, "SimpleStrategy")
        self._create_keyspace_and_basic_table(self.LOCALSTRATEGY_KEYSPACE_NAME,
                                              "LocalStrategy")
        repair_task = mgr_cluster.create_repair_task()
        task_final_status = repair_task.wait_and_get_final_status(timeout=7200)
        assert task_final_status == TaskStatus.DONE, 'Task: {} final status is: {}.'.format(
            repair_task.id, str(task_final_status))
        self.log.info('Task: {} is done.'.format(repair_task.id))
        self.log.debug("sctool version is : {}".format(manager_tool.version))

        expected_keyspaces_to_be_repaired = [
            "system_auth",
            "system_distributed",
            "system_traces",  # pylint: disable=invalid-name
            self.SIMPLESTRATEGY_KEYSPACE_NAME
        ]
        repair_progress_table = repair_task.detailed_progress
        self.log.info(
            "Looking in the repair output for all of the required keyspaces")
        for keyspace_name in expected_keyspaces_to_be_repaired:
            keyspace_repair_percentage = self._keyspace_value_in_progress_table(
                repair_task, repair_progress_table, keyspace_name)
            assert keyspace_repair_percentage is not None, \
                "The keyspace {} was not included in the repair!".format(keyspace_name)
            assert keyspace_repair_percentage == 100, \
                "The repair of the keyspace {} stopped at {}%".format(
                    keyspace_name, keyspace_repair_percentage)

        localstrategy_keyspace_percentage = self._keyspace_value_in_progress_table(  # pylint: disable=invalid-name
            repair_task, repair_progress_table,
            self.LOCALSTRATEGY_KEYSPACE_NAME)
        assert localstrategy_keyspace_percentage is None, \
            "The keyspace with the LocalStrategy replication strategy was included in the repair, even though it should not have been"
        self.log.info("The sctool repair command completed successfully")
        self.log.info('finishing test_repair_multiple_keyspace_types')
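The _keyspace_value_in_progress_table() helper used above is not shown in these examples. A hypothetical sketch of what it could look like, assuming the detailed progress is plain text with one line per keyspace ending in a percentage (the real helper also receives the repair task object, which is dropped here):

import re


def keyspace_value_in_progress_table(repair_progress_table, keyspace_name):
    """Return the repair percentage for keyspace_name, or None if it is absent."""
    for line in repair_progress_table.splitlines():
        if keyspace_name in line:
            match = re.search(r'(\d+(?:\.\d+)?)%', line)
            if match:
                return float(match.group(1))
    return None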
Example #4
    def test_basic_backup(self):
        self.log.info('starting test_basic_backup')
        if not self.is_cred_file_configured:
            self.update_config_file()
        location_list = [f's3:{self.bucket_name}']
        manager_tool = mgmt.get_scylla_manager_tool(
            manager_node=self.monitors.nodes[0])
        mgr_cluster = manager_tool.add_cluster(
            name=self.CLUSTER_NAME + '_basic',
            db_cluster=self.db_cluster,
            auth_token=self.monitors.mgmt_auth_token)
        self.generate_load_and_wait_for_results()
        backup_task = mgr_cluster.create_backup_task(
            location_list=location_list)
        backup_task.wait_for_status(list_status=[TaskStatus.DONE])
        self.verify_backup_success(mgr_cluster=mgr_cluster,
                                   backup_task=backup_task)
        self.log.info('finishing test_basic_backup')
    def test_client_encryption(self):
        manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
        mgr_cluster = manager_tool.add_cluster(name=self.CLUSTER_NAME+"_encryption", db_cluster=self.db_cluster)
        self._generate_load()
        repair_task = mgr_cluster.create_repair_task()
        self.db_cluster.enable_client_encrypt()
        mgr_cluster.update(client_encrypt=True)
        repair_task.start(use_continue=True)
        sleep = 40
        self.log.debug('Sleep {} seconds, waiting for the health-check task to run on schedule for the first time'.format(sleep))
        time.sleep(sleep)

        healthcheck_task = mgr_cluster.get_healthcheck_task()
        self.log.debug("Health-check task history is: {}".format(healthcheck_task.history))
        dict_host_health = mgr_cluster.get_hosts_health()
        for host_health in dict_host_health.values():
            assert host_health.ssl == HostSsl.ON, "Not all hosts have ssl 'ON'"
            assert host_health.status == HostStatus.UP, "Not all hosts have status 'UP'"
    def test_mgmt_cluster_crud(self):
        """
        Test steps:
        1) add a cluster to manager.
        2) update the cluster attributes in manager: name/host/ssh-user
        3) delete the cluster from manager and re-add again.
        """
        self.log.info('starting test_mgmt_cluster_crud')
        manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
        hosts = self.get_cluster_hosts_ip()
        selected_host = hosts[0]
        cluster_name = 'mgr_cluster_crud'
        mgr_cluster = manager_tool.add_cluster(name=cluster_name, host=selected_host,
                                               auth_token=self.monitors.mgmt_auth_token)
        # Test cluster attributes
        cluster_orig_name = mgr_cluster.name
        mgr_cluster.update(name="{}_renamed".format(cluster_orig_name))
        assert mgr_cluster.name == cluster_orig_name + "_renamed", "Cluster name wasn't changed after update command"
        mgr_cluster.delete()
        manager_tool.add_cluster(name=cluster_name, host=selected_host, auth_token=self.monitors.mgmt_auth_token)
        self.log.info('finishing test_mgmt_cluster_crud')
    def test_enospc_during_backup(self):
        self.log.info('starting test_enospc_during_backup')
        if not self.is_cred_file_configured:
            self.update_config_file()
        manager_tool = mgmt.get_scylla_manager_tool(
            manager_node=self.monitors.nodes[0])
        hosts = self.get_cluster_hosts_ip()
        location_list = [
            self.bucket_name,
        ]
        selected_host = hosts[0]
        mgr_cluster = manager_tool.get_cluster(cluster_name=self.CLUSTER_NAME) \
            or manager_tool.add_cluster(name=self.CLUSTER_NAME, host=selected_host,
                                        auth_token=self.monitors.mgmt_auth_token)

        target_node = self.db_cluster.nodes[1]

        self.generate_load_and_wait_for_results()
        has_enospc_been_reached = False
        with ignore_no_space_errors(node=target_node):
            try:
                backup_task = mgr_cluster.create_backup_task(
                    location_list=location_list)
                backup_task.wait_for_uploading_stage()
                backup_task.stop()

                reach_enospc_on_node(target_node=target_node)
                has_enospc_been_reached = True

                backup_task.start()

                backup_task.wait_and_get_final_status()
                assert backup_task.status == TaskStatus.DONE, "The backup failed to run on a node with no free space," \
                                                              " even though it should have had room for the snapshots" \
                                                              " thanks to the previous run"

            finally:
                if has_enospc_been_reached:
                    clean_enospc_on_node(target_node=target_node,
                                         sleep_time=30)
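reach_enospc_on_node() and clean_enospc_on_node() are framework helpers whose real implementations are not shown in these examples. A simplified, hypothetical sketch of the idea they rely on, namely filling the Scylla data mount until writes fail with ENOSPC and later removing the filler file:

import time

FILLER_PATH = '/var/lib/scylla/enospc_filler'  # hypothetical location on the data mount


def reach_enospc_on_node(target_node):
    # Ask df for the remaining space on the data mount, then allocate one file
    # of that size so that subsequent writes hit ENOSPC.
    free_kb = target_node.remoter.run(
        "df --output=avail /var/lib/scylla | tail -1").stdout.strip()
    target_node.remoter.sudo(f"fallocate -l {free_kb}K {FILLER_PATH}")


def clean_enospc_on_node(target_node, sleep_time=30):
    target_node.remoter.sudo(f"rm -f {FILLER_PATH}")
    time.sleep(sleep_time)  # give Scylla a moment to recover from the write errors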
Example #8
    def test_backup_multiple_ks_tables(self):
        self.log.info('starting test_backup_multiple_ks_tables')
        if not self.is_cred_file_configured:
            self.update_config_file()
        location_list = [f's3:{self.bucket_name}']
        manager_tool = mgmt.get_scylla_manager_tool(
            manager_node=self.monitors.nodes[0])
        mgr_cluster = manager_tool.add_cluster(
            name=self.CLUSTER_NAME + '_multiple-ks',
            db_cluster=self.db_cluster,
            auth_token=self.monitors.mgmt_auth_token)
        tables = self.create_ks_and_tables(10, 100)
        self.generate_load_and_wait_for_results()
        self.log.debug(f'tables list = {tables}')
        # TODO: insert data to those tables
        backup_task = mgr_cluster.create_backup_task(
            location_list=location_list)
        backup_task.wait_for_status(list_status=[TaskStatus.DONE],
                                    timeout=10800)
        self.verify_backup_success(mgr_cluster=mgr_cluster,
                                   backup_task=backup_task)
        self.log.info('finishing test_backup_multiple_ks_tables')
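The TODO above could be addressed along these lines: a sketch that writes a few rows into each created table through the cluster's CQL connection. Both the flat list of "keyspace.table" names and the "(id int PRIMARY KEY, data text)" schema are assumptions, since create_ks_and_tables() is not shown and the real column names may differ.

def insert_rows_into_tables(db_cluster, tables, rows_per_table=100):
    # 'tables' is assumed to be a flat list of "keyspace.table" names; each
    # table is assumed to have an "(id int PRIMARY KEY, data text)" schema.
    with db_cluster.cql_connection_patient(db_cluster.nodes[0]) as session:
        for full_table_name in tables:
            for key in range(rows_per_table):
                session.execute(
                    f"INSERT INTO {full_table_name} (id, data) "
                    f"VALUES ({key}, 'payload-{key}')")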
Example #9
def upgrade_scylla_manager(pre_upgrade_manager_version,
                           target_upgrade_server_version,
                           target_upgrade_agent_version, manager_node,
                           db_cluster):
    LOGGER.debug("Stopping manager server")
    if manager_node.is_docker():
        manager_node.remoter.sudo('supervisorctl stop scylla-manager')
    else:
        manager_node.remoter.sudo("systemctl stop scylla-manager")

    LOGGER.debug("Stopping manager agents")
    for node in db_cluster.nodes:
        node.remoter.sudo("systemctl stop scylla-manager-agent")

    LOGGER.debug("Upgrading manager server")
    manager_node.upgrade_mgmt(target_upgrade_server_version,
                              start_manager_after_upgrade=False)

    LOGGER.debug("Upgrading and starting manager agents")
    for node in db_cluster.nodes:
        node.upgrade_manager_agent(target_upgrade_agent_version)

    LOGGER.debug("Starting manager server")
    if manager_node.is_docker():
        manager_node.remoter.sudo('supervisorctl start scylla-manager')
    else:
        manager_node.remoter.sudo("systemctl start scylla-manager")
    time_to_sleep = 30
    LOGGER.debug(
        "Sleep %s seconds, waiting for the manager service to be ready to respond",
        time_to_sleep)
    sleep(time_to_sleep)

    LOGGER.debug("Comparing the new manager versions")
    manager_tool = get_scylla_manager_tool(manager_node=manager_node)
    new_manager_version = manager_tool.version
    assert new_manager_version != pre_upgrade_manager_version, "Manager failed to upgrade - " \
                                                               "previous and new versions are the same. Test failed!"
Example #10
    def test_manager_upgrade(self):
        """
        Test steps:
        1) Run the repair test.
        2) Upgrade the manager to the version given by the 'scylla_mgmt_upgrade_to_repo' yaml parameter (the 'from' version comes from 'scylla_mgmt_repo').
        """
        self.log.info('starting test_manager_upgrade')
        scylla_mgmt_upgrade_to_repo = self.params.get(
            'scylla_mgmt_upgrade_to_repo')
        manager_node = self.monitors.nodes[0]
        manager_tool = mgmt.get_scylla_manager_tool(manager_node=manager_node)
        selected_host = self.get_cluster_hosts_ip()[0]
        cluster_name = 'mgr_cluster1'
        mgr_cluster = manager_tool.get_cluster(cluster_name=cluster_name) or \
            manager_tool.add_cluster(name=cluster_name, host=selected_host,
                                     auth_token=self.monitors.mgmt_auth_token)
        self.log.info('Running some stress and repair before upgrade')
        self.test_mgmt_repair_nemesis()

        repair_task_list = mgr_cluster.repair_task_list

        manager_from_version = manager_tool.version
        manager_tool.upgrade(
            scylla_mgmt_upgrade_to_repo=scylla_mgmt_upgrade_to_repo)

        assert manager_from_version[0] != manager_tool.version[0], \
            "Manager version not changed after upgrade."
        # verify all repair tasks exist
        for repair_task in repair_task_list:
            self.log.debug("{} status: {}".format(repair_task.id,
                                                  repair_task.status))

        self.log.info('Running a new repair task after upgrade')
        repair_task = mgr_cluster.create_repair_task()
        self.log.debug("{} status: {}".format(repair_task.id,
                                              repair_task.status))
        self.log.info('finishing test_manager_upgrade')
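The check above compares only the first element of the two version values. A stricter sketch, assuming the version is a 'major.minor.patch...' string (the actual format returned by manager_tool.version is not shown in these examples):

import re


def parse_version(version_string):
    """'2.1.3-0.20200401...' -> (2, 1, 3)."""
    return tuple(int(number) for number in re.findall(r'\d+', version_string)[:3])


old, new = parse_version("2.0.11"), parse_version("2.1.3")
assert new > old, "Manager version did not increase after upgrade"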
Example #11
    def test_ssh_setup_script(self):
        self.log.info('starting test_ssh_setup_script')
        new_user = "******"
        new_user_identity_file = os.path.join(mgmt.MANAGER_IDENTITY_FILE_DIR,
                                              new_user) + ".pem"
        manager_tool = mgmt.get_scylla_manager_tool(
            manager_node=self.monitors.nodes[0])
        selected_host_ip = self.get_cluster_hosts_ip()[0]
        res_ssh_setup, _ssh = manager_tool.scylla_mgr_ssh_setup(
            node_ip=selected_host_ip, single_node=True, create_user=new_user)
        self.log.debug('res_ssh_setup: {}'.format(res_ssh_setup))
        new_user_login_message = "This account is currently not available"
        # e.g.: sudo ssh -i /root/.ssh/qa_user.pem -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -L 59164:0.0.0.0:10000 <new_user>@<host_ip>
        new_user_login_cmd = "sudo ssh -i {} -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -L 59164:0.0.0.0:10000 {}@{}".format(
            new_user_identity_file, new_user, selected_host_ip)
        self.log.debug(
            "new_user_login_cmd command is: {}".format(new_user_login_cmd))
        res_new_user_login_cmd = manager_tool.manager_node.remoter.run(
            new_user_login_cmd, ignore_status=True)
        self.log.debug(
            "res_new_user_login_cmd is: {}".format(res_new_user_login_cmd))
        assert new_user_login_message in res_new_user_login_cmd.stdout, "unexpected login message: {} (expected: {})".format(
            res_new_user_login_cmd.stdout, new_user_login_message)

        mgr_cluster = manager_tool.add_cluster(
            name=self.CLUSTER_NAME + "_ssh_setup",
            host=selected_host_ip,
            single_node=True,
            auth_token=self.monitors.mgmt_auth_token)
        # self.log.debug('mgr_cluster: {}'.format(mgr_cluster))
        healthcheck_task = mgr_cluster.get_healthcheck_task()
        self.log.debug("Health-check task history is: {}".format(
            healthcheck_task.history))
        dict_host_health = mgr_cluster.get_hosts_health()
        for host_health in dict_host_health.values():
            self.log.debug("host_health is: {}".format(host_health))
        self.log.info('finishing test_ssh_setup_script')
    def test_mgmt_cluster_crud(self):
        """

        Test steps:
        1) add a cluster to manager.
        2) update the cluster attributes in manager: name/host/ssh-user
        3) delete the cluster from manager and re-add again.
        """

        manager_tool = mgmt.get_scylla_manager_tool(
            manager_node=self.monitors.nodes[0])
        hosts = self._get_cluster_hosts_ip()
        selected_host = hosts[0]
        cluster_name = 'mgr_cluster1'
        mgr_cluster = manager_tool.get_cluster(
            cluster_name=cluster_name) or manager_tool.add_cluster(
                name=cluster_name, host=selected_host)

        # Test cluster attributes
        cluster_orig_name = mgr_cluster.name
        mgr_cluster.update(name="{}_renamed".format(cluster_orig_name))
        assert mgr_cluster.name == cluster_orig_name + "_renamed", "Cluster name wasn't changed after update command"

        origin_ssh_user = mgr_cluster.ssh_user
        origin_rsa_id = self.MANAGER_IDENTITY_FILE
        new_ssh_user = "******"
        new_rsa_id = '/tmp/scylla-test'

        mgr_cluster.update(ssh_user=new_ssh_user, ssh_identity_file=new_rsa_id)
        assert mgr_cluster.ssh_user == new_ssh_user, "Cluster ssh-user wasn't changed after update command"

        mgr_cluster.update(ssh_user=origin_ssh_user,
                           ssh_identity_file=origin_rsa_id)
        mgr_cluster.delete()
        mgr_cluster2 = manager_tool.add_cluster(name=cluster_name,
                                                host=selected_host)
    def test_client_encryption(self):
        self.log.info('starting test_client_encryption')
        manager_tool = mgmt.get_scylla_manager_tool(
            manager_node=self.monitors.nodes[0])
        mgr_cluster = manager_tool.add_cluster(
            name=self.CLUSTER_NAME + "_encryption",
            db_cluster=self.db_cluster,
            auth_token=self.monitors.mgmt_auth_token)
        self.generate_load_and_wait_for_results()
        repair_task = mgr_cluster.create_repair_task(fail_fast=True)
        dict_host_health = mgr_cluster.get_hosts_health()
        for host_health in dict_host_health.values():
            assert host_health.ssl == HostSsl.OFF, "Not all hosts have ssl 'OFF'"

        with DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, line="failed to do checksum for"), \
                DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="failed to do checksum for"), \
                DbEventsFilter(db_event=DatabaseLogEvent.DATABASE_ERROR, line="Reactor stalled"), \
                DbEventsFilter(db_event=DatabaseLogEvent.RUNTIME_ERROR, line="get_repair_meta: repair_meta_id"):

            self.db_cluster.enable_client_encrypt()

        mgr_cluster.update(client_encrypt=True)
        repair_task.start()
        sleep = 40
        self.log.debug(
            'Sleep {} seconds, waiting for the health-check task to run on schedule for the first time'
            .format(sleep))
        time.sleep(sleep)
        healthcheck_task = mgr_cluster.get_healthcheck_task()
        self.log.debug("Health-check task history is: {}".format(
            healthcheck_task.history))
        dict_host_health = mgr_cluster.get_hosts_health()
        for host_health in dict_host_health.values():
            assert host_health.ssl == HostSsl.ON, "Not all hosts have ssl 'ON'"
            assert host_health.status == HostStatus.UP, "Not all hosts have status 'UP'"
        self.log.info('finishing test_client_encryption')
Example #14
    def test_upgrade(self):  # pylint: disable=too-many-locals,too-many-statements
        target_upgrade_server_version = self.params.get(
            'target_scylla_mgmt_server_repo')
        target_upgrade_agent_version = self.params.get(
            'target_scylla_mgmt_agent_repo')
        manager_node = self.monitors.nodes[0]

        new_manager_http_port = 12345
        with manager_node.remote_manager_yaml() as scylla_manager_yaml:
            node_ip = scylla_manager_yaml["http"].split(":", maxsplit=1)[0]
            scylla_manager_yaml["http"] = f"{node_ip}:{new_manager_http_port}"
            scylla_manager_yaml["prometheus"] = f"{node_ip}:{self.params['manager_prometheus_port']}"
            LOGGER.info(
                "The new Scylla Manager yaml is:\n{}".format(scylla_manager_yaml))
        manager_node.remoter.sudo("systemctl restart scylla-manager")
        manager_node.wait_manager_server_up(port=new_manager_http_port)
        manager_tool = get_scylla_manager_tool(manager_node=manager_node)
        manager_tool.add_cluster(name="cluster_under_test",
                                 db_cluster=self.db_cluster,
                                 auth_token=self.monitors.mgmt_auth_token)
        current_manager_version = manager_tool.version

        LOGGER.debug("Generating load")
        self.generate_load_and_wait_for_results()

        mgr_cluster = manager_tool.get_cluster(
            cluster_name="cluster_under_test")

        with self.subTest("Creating reoccurring backup and repair tasks"):

            repair_task = mgr_cluster.create_repair_task(interval="1d")
            repair_task_current_details = wait_until_task_finishes_return_details(
                repair_task)

            backup_task = mgr_cluster.create_backup_task(
                interval="1d",
                location_list=self.locations,
                keyspace_list=["keyspace1"])
            backup_task_current_details = wait_until_task_finishes_return_details(
                backup_task)
            backup_task_snapshot = backup_task.get_snapshot_tag()
            pre_upgrade_backup_task_files = mgr_cluster.get_backup_files_dict(
                backup_task_snapshot)

        with self.subTest(
                "Creating a simple backup with the intention of purging it"):
            self.create_simple_table(table_name="cf1")
            self.write_multiple_rows(table_name="cf1", key_range=(1, 11))
            self.create_simple_table(table_name="cf2")
            self.write_multiple_rows(table_name="cf2", key_range=(1, 11))

            rerunning_backup_task = \
                mgr_cluster.create_backup_task(location_list=self.locations, keyspace_list=["ks1"], retention=2)
            rerunning_backup_task.wait_and_get_final_status(timeout=300,
                                                            step=20)
            assert rerunning_backup_task.status == TaskStatus.DONE, \
                f"Unknown failure in task {rerunning_backup_task.id}"

        with self.subTest("Creating a backup task and stopping it"):
            legacy_args = "--force" if manager_tool.client_version.startswith(
                "2.1") else None
            pausable_backup_task = mgr_cluster.create_backup_task(
                interval="1d",
                location_list=self.locations,
                keyspace_list=["system_*"],
                legacy_args=legacy_args,
            )
            pausable_backup_task.wait_for_status(
                list_status=[TaskStatus.RUNNING], timeout=180, step=2)
            pausable_backup_task.stop()

        upgrade_scylla_manager(
            pre_upgrade_manager_version=current_manager_version,
            target_upgrade_server_version=target_upgrade_server_version,
            target_upgrade_agent_version=target_upgrade_agent_version,
            manager_node=manager_node,
            db_cluster=self.db_cluster)

        LOGGER.debug(
            "Checking that the previously created tasks' details have not changed"
        )
        manager_tool = get_scylla_manager_tool(manager_node=manager_node)
        # make sure that the cluster is still added to the manager
        manager_tool.get_cluster(cluster_name="cluster_under_test")
        validate_previous_task_details(
            task=repair_task,
            previous_task_details=repair_task_current_details)
        validate_previous_task_details(
            task=backup_task,
            previous_task_details=backup_task_current_details)

        with self.subTest(
                "Continuing an older-version stopped backup task with a newer-version manager"
        ):
            pausable_backup_task.start()
            pausable_backup_task.wait_and_get_final_status(timeout=1200,
                                                           step=20)
            assert pausable_backup_task.status == TaskStatus.DONE, \
                f"task {pausable_backup_task.id} failed to continue after manager upgrade"

        with self.subTest(
                "Restoring an older version backup task with newer version manager"
        ):
            self.verify_backup_success(mgr_cluster=mgr_cluster,
                                       backup_task=backup_task)

        with self.subTest(
                "Executing the 'backup list' and 'backup files' commands on an older-version backup"
                " with a newer-version manager"):
            current_backup_files = mgr_cluster.get_backup_files_dict(
                backup_task_snapshot)
            assert pre_upgrade_backup_task_files == current_backup_files,\
                f"Backup task of the task {backup_task.id} is not identical after the manager upgrade:" \
                f"\nbefore the upgrade:\n{pre_upgrade_backup_task_files}\nafter the upgrade:\n{current_backup_files}"
            mgr_cluster.sctool.run(cmd=f" backup list -c {mgr_cluster.id}",
                                   is_verify_errorless_result=True)

        with self.subTest("purging a older version backup"):
            # Dropping one table
            with self.db_cluster.cql_connection_patient(
                    self.db_cluster.nodes[0]) as session:
                session.execute("DROP TABLE ks1.cf1 ;")

            for i in range(2, 4):
                LOGGER.debug("rerunning the backup task for the %s time", i)
                rerunning_backup_task.start(continue_task=False)
                rerunning_backup_task.wait_and_get_final_status(step=5)
                assert rerunning_backup_task.status == TaskStatus.DONE, \
                    f"backup {rerunning_backup_task.id} that was rerun again from the start has failed to reach " \
                    f"status DONE within expected time limit"
            per_node_backup_file_paths = mgr_cluster.get_backup_files_dict(
                snapshot_tag=rerunning_backup_task.get_snapshot_tag())
            for node in self.db_cluster.nodes:
                node_id = node.host_id
                # making sure that the files of the missing table aren't in s3
                assert "cf1" not in per_node_backup_file_paths[node_id]["ks1"], \
                    "The missing table is still in s3, even though it should have been purged"
Example #15
    def test_upgrade(self):  # pylint: disable=too-many-locals,too-many-statements
        target_upgrade_server_version = self.params.get(
            'target_scylla_mgmt_server_repo')
        target_upgrade_agent_version = self.params.get(
            'target_scylla_mgmt_agent_repo')
        manager_node = self.monitors.nodes[0]
        manager_tool = get_scylla_manager_tool(manager_node=manager_node)
        manager_tool.add_cluster(name="cluster_under_test",
                                 db_cluster=self.db_cluster,
                                 auth_token=self.monitors.mgmt_auth_token)
        current_manager_version = manager_tool.version

        LOGGER.debug("Generating load")
        self.generate_load_and_wait_for_results()

        mgr_cluster = manager_tool.get_cluster(
            cluster_name="cluster_under_test")

        with self.subTest("Creating reoccurring backup and repair tasks"):

            repair_task = mgr_cluster.create_repair_task(interval="1d")
            repair_task_current_details = wait_until_task_finishes_return_details(
                repair_task)

            self.update_all_agent_config_files()
            bucket_name = self.params.get('backup_bucket_location').split()[0]
            location_list = [f's3:{bucket_name}']
            backup_task = mgr_cluster.create_backup_task(
                interval="1d",
                location_list=location_list,
                keyspace_list=["keyspace1"])
            backup_task_current_details = wait_until_task_finishes_return_details(
                backup_task)
            backup_task_snapshot = backup_task.get_snapshot_tag()
            pre_upgrade_backup_task_files = mgr_cluster.get_backup_files_dict(
                backup_task_snapshot)

        with self.subTest("Creating a backup task and stopping it"):
            pausable_backup_task = mgr_cluster.create_backup_task(
                interval="1d",
                location_list=location_list,
                keyspace_list=["system_*"])
            pausable_backup_task.wait_for_status(
                list_status=[TaskStatus.RUNNING], timeout=180, step=2)
            pausable_backup_task.stop()

        with self.subTest(
                "Creating a simple backup with the intention of purging it"):
            self.create_simple_table(table_name="cf1")
            self.write_multiple_rows(table_name="cf1", key_range=(1, 11))
            self.create_simple_table(table_name="cf2")
            self.write_multiple_rows(table_name="cf2", key_range=(1, 11))

            bucket_name = self.params.get('backup_bucket_location').split()[0]
            location_list = [f's3:{bucket_name}']
            rerunning_backup_task = mgr_cluster.create_backup_task(
                location_list=location_list,
                keyspace_list=["ks1"],
                retention=2)
            rerunning_backup_task.wait_and_get_final_status(timeout=300,
                                                            step=20)
            assert rerunning_backup_task.status == TaskStatus.DONE, \
                f"Unknown failure in task {rerunning_backup_task.id}"

        upgrade_scylla_manager(
            pre_upgrade_manager_version=current_manager_version,
            target_upgrade_server_version=target_upgrade_server_version,
            target_upgrade_agent_version=target_upgrade_agent_version,
            manager_node=manager_node,
            db_cluster=self.db_cluster)

        LOGGER.debug(
            "Checking that the previously created tasks' details have not changed"
        )
        manager_tool = get_scylla_manager_tool(manager_node=manager_node)
        # make sure that the cluster is still added to the manager
        manager_tool.get_cluster(cluster_name="cluster_under_test")
        validate_previous_task_details(
            task=repair_task,
            previous_task_details=repair_task_current_details)
        validate_previous_task_details(
            task=backup_task,
            previous_task_details=backup_task_current_details)

        with self.subTest("Restoring a 2.0 backup task with 2.1 manager"):
            self.verify_backup_success(mgr_cluster=mgr_cluster,
                                       backup_task=backup_task)

        with self.subTest(
                "Continuing a 2.0 stopped backup task with 2.1 manager"):
            pausable_backup_task.start()
            pausable_backup_task.wait_and_get_final_status(timeout=1200,
                                                           step=20)
            assert pausable_backup_task.status == TaskStatus.DONE, \
                f"task {pausable_backup_task.id} failed to continue after manager upgrade"

        with self.subTest(
                "Executing the 'backup list' and 'backup files' commands on a 2.0 backup with 2.1 manager"
        ):
            current_backup_files = mgr_cluster.get_backup_files_dict(
                backup_task_snapshot)
            assert pre_upgrade_backup_task_files == current_backup_files,\
                f"Backup task of the task {backup_task.id} is not identical after the manager upgrade:" \
                f"\nbefore the upgrade:\n{pre_upgrade_backup_task_files}\nafter the upgrade:\n{current_backup_files}"
            mgr_cluster.sctool.run(cmd=f" backup list -c {mgr_cluster.id}",
                                   is_verify_errorless_result=True)

        with self.subTest("purging a 2.0 backup"):
            # Dropping one table
            with self.db_cluster.cql_connection_patient(
                    self.db_cluster.nodes[0]) as session:
                session.execute("DROP TABLE ks1.cf1 ;")

            for i in range(2, 4):
                LOGGER.debug(f"rerunning the backup task for the {i} time")
                rerunning_backup_task.start(continue_task=False)
            per_node_backup_file_paths = mgr_cluster.get_backup_files_dict(
                snapshot_tag=rerunning_backup_task.get_snapshot_tag())
            for node in self.db_cluster.nodes:
                nodetool_info = self.db_cluster.get_nodetool_info(node)
                node_id = nodetool_info['ID']
                # making sure that the files of the missing table aren't in s3
                assert "cf1" not in per_node_backup_file_paths[node_id]["ks1"], \
                    "The missing table is still in s3, even though it should have been purged"
    def test_intensity_and_parallel(self, fault_multiple_nodes):
        keyspace_to_be_repaired = "keyspace2"
        InfoEvent(message='starting test_intensity_and_parallel').publish()
        if not self.is_cred_file_configured:
            self.update_config_file()
        manager_tool = mgmt.get_scylla_manager_tool(
            manager_node=self.monitors.nodes[0])
        mgr_cluster = manager_tool.add_cluster(
            name=self.CLUSTER_NAME + '_intensity_and_parallel',
            db_cluster=self.db_cluster,
            auth_token=self.monitors.mgmt_auth_token,
        )

        InfoEvent(message="Starting faulty load (to be repaired)").publish()
        self.create_missing_rows_in_cluster(
            create_missing_rows_in_multiple_nodes=fault_multiple_nodes,
            keyspace_to_be_repaired=keyspace_to_be_repaired)

        InfoEvent(message="Starting a repair with no intensity").publish()
        base_repair_task = mgr_cluster.create_repair_task(keyspace="keyspace*")
        base_repair_task.wait_and_get_final_status(step=30)
        assert base_repair_task.status == TaskStatus.DONE, "The base repair task did not end in the expected time"
        InfoEvent(message=f"The base repair, with no intensity argument, "
                          f"took {base_repair_task.duration}").publish()

        with self.db_cluster.cql_connection_patient(
                self.db_cluster.nodes[0]) as session:
            session.execute(
                f"DROP KEYSPACE IF EXISTS {keyspace_to_be_repaired}")

        arg_list = [{"intensity": .5},
                    {"intensity": .25},
                    {"intensity": .0001},
                    {"intensity": 2},
                    {"intensity": 4},
                    {"parallel": 1},
                    {"parallel": 2},
                    {"intensity": 2, "parallel": 1},
                    {"intensity": 100},
                    {"intensity": 0}]

        for arg_dict in arg_list:
            InfoEvent(
                message="Starting faulty load (to be repaired)").publish()
            self.create_missing_rows_in_cluster(
                create_missing_rows_in_multiple_nodes=fault_multiple_nodes,
                keyspace_to_be_repaired=keyspace_to_be_repaired)

            InfoEvent(message=f"Starting a repair with {arg_dict}").publish()
            repair_task = mgr_cluster.create_repair_task(**arg_dict,
                                                         keyspace="keyspace*")
            repair_task.wait_and_get_final_status(step=30)
            InfoEvent(
                message=f"repair with {arg_dict} took {repair_task.duration}"
            ).publish()

            with self.db_cluster.cql_connection_patient(
                    self.db_cluster.nodes[0]) as session:
                session.execute(
                    f"DROP KEYSPACE IF EXISTS {keyspace_to_be_repaired}")
        InfoEvent(message='finishing test_intensity_and_parallel').publish()
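A small follow-up sketch: if each (arg_dict, repair_task.duration) pair from the loop above is collected into a list, one summary can be published at the end, which makes the intensity/parallel comparison easier to read in the event log.

def summarize_repair_durations(durations_by_args):
    """durations_by_args: a list of (arg_dict, duration) tuples collected in the loop."""
    return "\n".join(f"{args} -> {duration}" for args, duration in durations_by_args)

# inside the loop:  durations.append((arg_dict, repair_task.duration))
# after the loop:   InfoEvent(message="Repair duration summary:\n"
#                                     + summarize_repair_durations(durations)).publish()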