def __init__(self, node: CephAdmin):
    """
    Initializes the env to run rados commands
    Args:
        node: CephAdmin object
    """
    self.rados_obj = RadosOrchestrator(node=node)
def change_config_for_slow_ops(rados_obj: RadosOrchestrator, action: str, **kwargs):
    """
    Changes a few config values on the ceph cluster to intentionally increase the chances of
    hitting slow_ops on the cluster network.
    Actions performed and rationale:
    * paxos_service_trim_min & paxos_service_trim_max set as mentioned in
      bz : https://bugzilla.redhat.com/show_bug.cgi?id=1943357#c0
    * osd_op_complaint_time -> reducing the time threshold within which an OSD should respond to requests
    * osd_max_backfills & osd_recovery_max_active -> Increasing the number of threads for recovery &
      backfill so as to reduce the n/w bandwidth available for client IO operations
    Args:
        rados_obj: Rados object for command execution
        action: whether to set the config or to remove it from the cluster
                Values : "set" -> to set the config values
                         "rm"  -> to remove the config changes made
        kwargs: Any other optional args that need to be passed
    Raises:
        TestBedSetupFailure if the values set could not be verified on the cluster
    """
    value_map = {
        "paxos_service_trim_min": kwargs.get("paxos_service_trim_min", 10),
        "paxos_service_trim_max": kwargs.get("paxos_service_trim_max", 100),
        "osd_op_complaint_time": kwargs.get("osd_op_complaint_time", 0.000001),
        "osd_max_backfills": kwargs.get("osd_max_backfills", 8),
        "osd_recovery_max_active": kwargs.get("osd_recovery_max_active", 10),
    }
    cmd_map = {
        "paxos_service_trim_min": f"ceph config {action} mon paxos_service_trim_min",
        "paxos_service_trim_max": f"ceph config {action} mon paxos_service_trim_max",
        "osd_op_complaint_time": f"ceph config {action} osd osd_op_complaint_time",
        "osd_max_backfills": f"ceph config {action} osd osd_max_backfills",
        "osd_recovery_max_active": f"ceph config {action} osd osd_recovery_max_active",
    }

    # Removing the config values set when action is to remove
    if action == "rm":
        for cmd in cmd_map.keys():
            rados_obj.node.shell([cmd_map[cmd]])
        return

    # Adding the config values
    for val in cmd_map.keys():
        cmd = f"{cmd_map[val]} {value_map[val]}"
        rados_obj.node.shell([cmd])

    # Verifying the values set in the config
    config_dump = rados_obj.run_ceph_command(cmd="ceph config dump")
    for val in cmd_map.keys():
        for conf in config_dump:
            if conf["name"] == val:
                if float(conf["value"]) != float(value_map[val]):
                    error = f"Values do not match for config {conf['name']}"
                    raise TestBedSetupFailure(error)
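# Illustrative usage sketch (not invoked by the test framework): how a test could wrap the
# helper above to tighten the slow-ops related settings for a workload and always restore the
# defaults afterwards. The keyword override shown is one of the optional kwargs accepted above.
def _example_slow_ops_config(rados_obj: RadosOrchestrator):
    # Tighten the complaint time and trim settings for the duration of a test workload
    change_config_for_slow_ops(rados_obj=rados_obj, action="set", osd_op_complaint_time=0.0001)
    try:
        pass  # run the workload that is expected to surface SLOW_OPS here
    finally:
        # Restore the defaults even if the workload fails
        change_config_for_slow_ops(rados_obj=rados_obj, action="rm")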
def run(ceph_cluster, **kw):
    """
    Verifies the config change history in monitor configuration database changes
    Returns: 1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    # getting the last config change, to which we will roll back later
    init_config = mon_obj.get_ceph_log(count=1)[0]
    log.info(
        "Config at the beginning of test.\n"
        f"Version: {init_config['version']}\n"
        f"Changes made: {init_config['changes']}"
    )

    log.info("Setting new changes and verifying if the changes are reflected in the log")
    if not mon_obj.set_config(section="osd", name="osd_max_scrubs", value="8"):
        log.error("Error setting config")
        return 1

    # Checking the versions and changes made.
    test_config = mon_obj.get_ceph_log(count=1)[0]
    log.info(
        "Config changes made for test.\n"
        f"Version: {test_config['version']}\n"
        f"Changes made: {test_config['changes']}"
    )
    if not test_config["version"] > init_config["version"]:
        log.error(
            "The log is not updated with new config changes. "
            f"Version: {test_config['version']}"
        )
        return 1
    try:
        name = test_config["changes"][0].get("name")
        value = str(test_config["changes"][0].get("new_value"))
        if not (name == "osd/osd_max_scrubs" and value == "8"):
            log.error(
                "The log is not updated with new config changes. "
                f"Changes made: {test_config['changes']}"
            )
            return 1
    except Exception:
        log.error("The log collected does not contain the value and changes made")
        return 1
    log.info("The ceph config log is successfully updated after the changes")
    return 0
def wait_for_clean_pg_sets(rados_obj: RadosOrchestrator) -> bool:
    """
    Waits for up to 2.5 hours for the PG's to enter the active + clean state after stretch changes
    Automation for bug : [1] & [2]
    Args:
        rados_obj: RadosOrchestrator object to run commands
    Returns: True -> pass, False -> fail
    """
    end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
    while end_time > datetime.datetime.now():
        flag = True
        status_report = rados_obj.run_ceph_command(cmd="ceph report")

        # Proceeding to check if all PG's are in active + clean
        for entry in status_report["num_pg_by_state"]:
            rec = (
                "remapped",
                "backfilling",
                "degraded",
                "incomplete",
                "peering",
                "recovering",
                "recovery_wait",
                "undersized",
                "backfilling_wait",
            )
            if any(key in rec for key in entry["state"].split("+")):
                flag = False

        if flag:
            log.info("The recovery and back-filling of the OSD is completed")
            return True
        log.info(
            f"Waiting for active + clean. Active alerts: {status_report['health']['checks'].keys()}, "
            f"PG States : {status_report['num_pg_by_state']}"
            f" checking status again in 2 minutes"
        )
        time.sleep(120)

    log.error("The cluster did not reach the active + clean state")
    return False
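# For reference, a minimal sketch of the "num_pg_by_state" structure that the polling loop above
# parses out of "ceph report". The counts shown are made up; only the field names reflect what the
# loop reads. The helper below restates the same check as a standalone, side-effect free function.
# Example list:
#   [{"state": "active+clean", "num": 120},
#    {"state": "active+remapped+backfilling", "num": 8}]
def _example_is_pg_state_clean(num_pg_by_state: list) -> bool:
    unclean = (
        "remapped", "backfilling", "degraded", "incomplete", "peering",
        "recovering", "recovery_wait", "undersized", "backfilling_wait",
    )
    return not any(
        any(part in unclean for part in entry["state"].split("+"))
        for entry in num_pg_by_state
    )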
def run(ceph_cluster, **kw):
    """
    Verifies the config change reverts in monitor configuration database changes taken from logs
    Returns: 1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    init_config = mon_obj.get_ceph_log(count=1)[0]
    if not mon_obj.set_config(section="mon", name="mon_max_log_epochs", value="1000"):
        log.error("Error setting config")
        return 1
    log.info(
        f"Proceeding with reverting the last config change, selecting version: {init_config['version']}"
    )
    if not mon_obj.ceph_config_reset(version=init_config["version"]):
        log.error(f"Could not revert to the selected version : {init_config['version']}")
        return 1

    log.info("Reverted to the selected version. Checking if the config value is removed")
    if mon_obj.verify_set_config(section="mon", name="mon_max_log_epochs", value="1000"):
        log.error("Config is still set after the reset")
        return 1

    test_config = mon_obj.get_ceph_log(count=1)[0]
    log.info(f"Reverted successfully to the previous version. Config log : {test_config}")
    log.info("The ceph config log is successfully updated after the changes")
    return 0
def run(ceph_cluster, **kw): """ Prepares the cluster to run rados tests. Actions Performed: 1. Create a Replicated and Erasure coded pools and write Objects into pools 2. Setup email alerts for sending errors/warnings on the cluster. Verifies Bugs: https://bugzilla.redhat.com/show_bug.cgi?id=1849894 https://bugzilla.redhat.com/show_bug.cgi?id=1878145 3. Enable logging into file and check file permissions Verifies Bug : https://bugzilla.redhat.com/show_bug.cgi?id=1884469 Args: ceph_cluster (ceph.ceph.Ceph): ceph cluster kw: Args that need to be passed to the test for initialization Returns: 1 -> Fail, 0 -> Pass """ log.info(run.__doc__) config = kw["config"] cephadm = CephAdmin(cluster=ceph_cluster, **config) rados_obj = RadosOrchestrator(node=cephadm) mon_obj = MonConfigMethods(rados_obj=rados_obj) out, err = cephadm.shell(["uuidgen"]) uuid = out.split("-")[0] if config.get("ec_pool"): ec_config = config.get("ec_pool") ec_config.setdefault("pool_name", f"ecpool_{uuid}") if not rados_obj.create_erasure_pool(name=uuid, **ec_config): log.error("Failed to create the EC Pool") return 1 if ec_config.get("test_overwrites_pool"): if not rados_obj.verify_ec_overwrites(**ec_config): log.error("Failed to create the EC Pool") return 1 else: if not rados_obj.bench_write(**ec_config): log.error("Failed to write objects into the EC Pool") return 1 rados_obj.bench_read(**ec_config) log.info( "Created the EC Pool, Finished writing data into the pool") if ec_config.get("delete_pool"): if not rados_obj.detete_pool(pool=ec_config["pool_name"]): log.error("Failed to delete EC Pool") return 1 if config.get("replicated_pool"): rep_config = config.get("replicated_pool") rep_config.setdefault("pool_name", f"repool_{uuid}") if not rados_obj.create_pool(**rep_config, ): log.error("Failed to create the replicated Pool") return 1 if not rados_obj.bench_write(**rep_config): log.error("Failed to write objects into the EC Pool") return 1 rados_obj.bench_read(**rep_config) log.info( "Created the replicated Pool, Finished writing data into the pool") if rep_config.get("delete_pool"): if not rados_obj.detete_pool(pool=rep_config["pool_name"]): log.error("Failed to delete replicated Pool") return 1 if config.get("set_pool_configs"): changes = config["set_pool_configs"] pool_name = changes["pool_name"] configurations = changes["configurations"] for conf in configurations.keys(): if not rados_obj.set_pool_property( pool=pool_name, props=conf, value=configurations[conf]): log.error(f"failed to set property {conf} on the cluster") return 1 log.info(f"made the config changes on the pool {pool_name}") if config.get("email_alerts"): alert_config = config.get("email_alerts") if not rados_obj.enable_email_alerts(**alert_config): log.error("Error while configuring email alerts") return 1 log.info("email alerts configured") if config.get("Verify_config_parameters"): test_config = config.get("Verify_config_parameters") test_node = ceph_cluster.get_nodes(role="osd")[0] for conf in test_config["configurations"]: for entry in conf.values(): if entry.get("location_type") == "host": entry["location_value"] = test_node.hostname if not mon_obj.set_config(**entry): log.error(f"Error setting config {conf}") return 1 log.info("done") pool_name = "test_pool_1" if not rados_obj.create_pool(pool_name=pool_name, pg_num=16): log.error("Failed to create the replicated Pool") return 1 rados_obj.bench_write(pool_name=pool_name, rados_write_duration=50) # Removing test configurations for conf in test_config["configurations"]: for entry in conf.values(): 
if entry.get("location_type") == "host": entry["location_value"] = test_node.hostname if not mon_obj.remove_config(**entry): log.error(f"Error setting config {conf}") return 1 log.info("finished removing values, passed") if config.get("log_to_file"): if not rados_obj.enable_file_logging(): log.error("Error while setting config to enable logging into file") return 1 log.info("Logging to file configured") if config.get("cluster_configuration_checks"): cls_config = config.get("cluster_configuration_checks") if not rados_obj.set_cluster_configuration_checks(**cls_config): log.error("Error while setting Cluster config checks") return 1 log.info("Set up cluster configuration checks") if config.get("configure_balancer"): balancer_config = config.get("configure_balancer") if not rados_obj.enable_balancer(**balancer_config): log.error("Error while setting up balancer on the Cluster") return 1 log.info("Set up Balancer on the cluster") if config.get("configure_pg_autoscaler"): autoscaler_config = config.get("configure_pg_autoscaler") if not rados_obj.configure_pg_autoscaler(**autoscaler_config): log.error("Error while setting up pg_autoscaler on the Cluster") return 1 log.info("Set up pg_autoscaler on the cluster") if config.get("enable_compression"): compression_conf = config["enable_compression"] pool_name = compression_conf["pool_name"] for conf in compression_conf["configurations"]: for entry in conf.values(): if not rados_obj.pool_inline_compression(pool_name=pool_name, **entry): log.error( f"Error setting compression on pool : {pool_name} for config {conf}" ) return 1 if not rados_obj.bench_write(**compression_conf): log.error("Failed to write objects into Pool") return 1 rados_obj.bench_read(**compression_conf) log.info( "Created the replicated Pool, Finished writing data into the pool" ) log.info("Completed compression tests") if config.get("delete_pools"): for name in config["delete_pools"]: if not rados_obj.detete_pool(name): log.error(f"the pool {name} could not be deleted") return 1 log.info("deleted all the given pools successfully") log.info("All Pre-requisites completed to run Rados suite") return 0
def run(ceph_cluster, **kw): """ Automates OSD re-balance test scenarios. 1. Create replicated and/or erasure pool/pools 2. Identify the osd to be removed 3. Fetch the host by daemon_type=osd and osd id 4. Fetch container id and device path 5. Mark osd out and wait for pgs to be active+clean 6. Remove OSD 7. Zap device and wait for device not present 8. Add OSD and wait for device present and pgs to be active+clean """ log.info(run.__doc__) config = kw["config"] cephadm = CephAdmin(cluster=ceph_cluster, **config) rados_obj = RadosOrchestrator(node=cephadm) client_node = ceph_cluster.get_nodes(role="client")[0] log.info("Running create pool test case") if config.get("create_pools"): pools = config.get("create_pools") for each_pool in pools: cr_pool = each_pool["create_pool"] if cr_pool.get("pool_type", "replicated") == "erasure": method_should_succeed(rados_obj.create_erasure_pool, name=cr_pool["pool_name"], **cr_pool) else: method_should_succeed(rados_obj.create_pool, pool_name=cr_pool["pool_name"], **cr_pool) method_should_succeed(rados_obj.bench_write, **cr_pool) pool = random.choice(pools)["create_pool"] if not pool: log.error("Failed to retrieve pool details") return 1 rados_obj.change_recover_threads(config=pool, action="set") acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool["pool_name"]) log.info(f"Acting set {acting_pg_set}") if not acting_pg_set: log.error("Failed to retrieve acting pg set") return 1 osd_id = acting_pg_set[0] host = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id) if not host: log.error("Failed to fetch host details") return 1 # fetch container id out, _ = host.exec_command(sudo=True, cmd="podman ps --format json") container_id = [ item["Names"][0] for item in json.loads(out.read().decode()) if f"osd.{osd_id}" in item["Command"] ][0] if not container_id: log.error("Failed to retrieve container id") return 1 # fetch device path by osd_id vol_out, _ = host.exec_command( sudo=True, cmd=f"podman exec {container_id} ceph-volume lvm list --format json", ) volume_out = vol_out.read().decode() dev_path = [ v[0]["devices"][0] for k, v in json.loads(volume_out).items() if str(k) == str(osd_id) ][0] if not dev_path: log.error("Failed to get device path") return 1 log.debug( f"device path : {dev_path}, osd_id : {osd_id}, host.hostname : {host.hostname}" ) utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=True) method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id) method_should_succeed(wait_for_clean_pg_sets, rados_obj) utils.osd_remove(ceph_cluster, osd_id) method_should_succeed(wait_for_clean_pg_sets, rados_obj) method_should_succeed(utils.zap_device, ceph_cluster, host.hostname, dev_path) method_should_succeed(wait_for_device, host, container_id, osd_id, action="remove") utils.add_osd(ceph_cluster, host.hostname, dev_path, osd_id) method_should_succeed(wait_for_device, host, container_id, osd_id, action="add") method_should_succeed(wait_for_clean_pg_sets, rados_obj) do_rados_put(mon=client_node, pool=pool["pool_name"], nobj=1000) method_should_succeed(wait_for_clean_pg_sets, rados_obj) utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=False) rados_obj.change_recover_threads(config=pool, action="rm") if config.get("delete_pools"): for name in config["delete_pools"]: method_should_succeed(rados_obj.detete_pool, name) log.info("deleted all the given pools successfully") return 0
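# Reference sketch of the JSON shape that the device-path lookup above expects from
# "ceph-volume lvm list --format json": a dict keyed by OSD id, each value a list of LV entries
# carrying a "devices" list. The device names below are placeholders, not real cluster output.
# {
#     "2": [{"devices": ["/dev/sdb"], "type": "block"}],
#     "5": [{"devices": ["/dev/sdc"], "type": "block"}]
# }
def _example_dev_path_for_osd(ceph_volume_json: dict, osd_id: int) -> str:
    # Returns the first backing device of the given OSD id; raises KeyError if the id is absent
    return ceph_volume_json[str(osd_id)][0]["devices"][0]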
def run(ceph_cluster, **kw):
    """
    Test to create a pool, then add, get, and delete objects & snapshots on it.
    Returns: 1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    pool_obj = PoolFunctions(node=cephadm)
    client_node = rados_obj.ceph_cluster.get_nodes(role="client")[0]
    pool_target_configs = config["verify_client_pg_access"]["configurations"]
    num_snaps = config["verify_client_pg_access"]["num_snapshots"]
    log.debug(
        "Verifying the effects of rados put, get, snap & delete on a pool with a single PG"
    )

    # Creating pools and starting the test
    for entry in pool_target_configs.values():
        pool_name = entry["pool_name"]
        log.debug(
            f"Creating {entry['pool_type']} pool on the cluster with name {pool_name}"
        )
        if entry.get("pool_type", "replicated") == "erasure":
            method_should_succeed(
                rados_obj.create_erasure_pool, name=pool_name, **entry
            )
        else:
            method_should_succeed(
                rados_obj.create_pool,
                **entry,
            )

        # Creating and reading objects
        with parallel() as p:
            p.spawn(do_rados_put, client_node, pool_name, 500)
            p.spawn(do_rados_get, client_node, pool_name, 1)

        # Creating and deleting snapshots on the pool
        snapshots = []
        for _ in range(num_snaps):
            snap = pool_obj.create_pool_snap(pool_name=pool_name)
            if snap:
                snapshots.append(snap)
            else:
                log.error("Could not create snapshot on the pool")
                return 1

        if not pool_obj.delete_pool_snap(pool_name=pool_name):
            log.error("Could not delete the snapshots created")
            return 1

        # Deleting the objects created on the pool
        if not pool_obj.do_rados_delete(pool_name=pool_name):
            log.error("Could not delete the objects present on the pool")
            return 1

        rados_obj.detete_pool(pool=pool_name)
        log.info(f"Completed all operations on pool {pool_name}")

    log.info(
        "Completed testing the effects of rados put, get, snap & delete on a pool with a single PG"
    )
    return 0
def run(ceph_cluster, **kw): """ Performs various pool related validation tests Returns: 1 -> Fail, 0 -> Pass """ log.info(run.__doc__) config = kw["config"] cephadm = CephAdmin(cluster=ceph_cluster, **config) rados_obj = RadosOrchestrator(node=cephadm) mon_obj = MonConfigMethods(rados_obj=rados_obj) if config.get("ec_pool_recovery_improvement"): ec_config = config.get("ec_pool_recovery_improvement") if not rados_obj.create_erasure_pool(name="recovery", **ec_config): log.error("Failed to create the EC Pool") return 1 if not rados_obj.bench_write(**ec_config): log.error("Failed to write objects into the EC Pool") return 1 rados_obj.bench_read(**ec_config) log.info("Created the EC Pool, Finished writing data into the pool") # getting the acting set for the created pool acting_pg_set = rados_obj.get_pg_acting_set( pool_name=ec_config["pool_name"]) if len(acting_pg_set) != ec_config["k"] + ec_config["m"]: log.error( f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m" ) return 1 log.info( f" Acting set of the pool consists of OSD's : {acting_pg_set}") log.info( f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery" ) stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])] for osd_id in stop_osds: if not rados_obj.change_osd_state(action="stop", target=osd_id): log.error(f"Unable to stop the OSD : {osd_id}") return 1 log.info( "Stopped 'm' number of OSD's from, starting to wait for recovery") rados_obj.change_recover_threads(config=ec_config, action="set") # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down time.sleep(25) # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000) while end_time > datetime.datetime.now(): flag = True status_report = rados_obj.run_ceph_command(cmd="ceph report") # Proceeding to check if all PG's are in active + clean for entry in status_report["num_pg_by_state"]: rec = ( "backfilling", "degraded", "incomplete", "recovering", "recovery_wait", "backfilling_wait", "peered", "undersized", ) if any(key in rec for key in entry["state"].split("+")): flag = False if flag: log.info( "The recovery and back-filling of the OSD is completed") break log.info( f"Waiting for active + clean. 
Active aletrs: {status_report['health']['checks'].keys()}," f"PG States : {status_report['num_pg_by_state']}" f" checking status again in 1 minute") time.sleep(60) # getting the acting set for the created pool after recovery acting_pg_set = rados_obj.get_pg_acting_set( pool_name=ec_config["pool_name"]) if len(acting_pg_set) != ec_config["k"] + ec_config["m"]: log.error( f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m" ) return 1 log.info( f" Acting set of the pool consists of OSD's : {acting_pg_set}") # Changing recovery threads back to default rados_obj.change_recover_threads(config=ec_config, action="rm") log.debug("Starting the stopped OSD's") for osd_id in stop_osds: if not rados_obj.change_osd_state(action="restart", target=osd_id): log.error(f"Unable to restart the OSD : {osd_id}") return 1 # Sleep for 5 seconds for OSD's to join the cluster time.sleep(5) if not flag: log.error( "The pool did not reach active + Clean state after recovery") return 1 # Deleting the pool created if not rados_obj.detete_pool(pool=ec_config["pool_name"]): log.error( f"the pool {ec_config['pool_name']} could not be deleted") return 1 log.info("Successfully tested EC pool recovery with K osd's surviving") return 0 if config.get("Compression_tests"): """ Create a 2 replicated pools: 1. Pool_1 : enable any compression algorithm(def snappy) and compression mode(aggressive/force). 2. Pool_2 : set compression mode to none Writing the same amount of data on 2 pools, size of pool with compression on would consume less space """ pool_config = config["Compression_tests"]["pool_config"] compression_config = config["Compression_tests"]["compression_config"] pool_1 = pool_config["pool-1"] pool_2 = pool_config["pool-2"] if config["Compression_tests"]["pool_type"] == "replicated": if not rados_obj.create_pool(pool_name=pool_1, **pool_config): log.error("could not create pool-1") return 1 if not rados_obj.create_pool(pool_name=pool_2, **pool_config): log.error("could not create pool-2") return 1 elif config["Compression_tests"]["pool_type"] == "erasure": pool_config["pool_name"] = pool_1 if not rados_obj.create_erasure_pool(name=pool_1, **pool_config): log.error("could not create pool-1") return 1 pool_config["pool_name"] = pool_2 if not rados_obj.create_erasure_pool(name=pool_2, **pool_config): log.error("could not create pool-2") return 1 del pool_config["pool_name"] log.debug("Created two pools to test compression") # Enabling compression on pool-1 if not rados_obj.pool_inline_compression(pool_name=pool_1, **compression_config): log.error( f"Error setting compression on pool : {pool_1} for config {compression_config}" ) return 1 # Writing the same amount of data into two pools if not rados_obj.bench_write(pool_name=pool_1, **pool_config): log.error( "Failed to write objects into Pool-1, with compression enabled" ) return 1 if not rados_obj.bench_write(pool_name=pool_2, **pool_config): log.error( "Failed to write objects into Pool-2, without compression enabled" ) return 1 # Sleeping for 5 seconds for status to be updated. time.sleep(5) log.debug( "Finished writing data into the two pools. 
Checking pool stats") try: pool_stats = rados_obj.run_ceph_command( cmd="ceph df detail")["pools"] pool_1_stats = [ detail for detail in pool_stats if detail["name"] == pool_1 ][0]["stats"] pool_2_stats = [ detail for detail in pool_stats if detail["name"] == pool_2 ][0]["stats"] except KeyError: log.error( "No stats about the pools requested found on the cluster") return 1 log.debug(f"Pool-1 stats: {pool_1_stats}") log.debug(f"Pool-2 stats: {pool_2_stats}") if pool_1_stats["compress_bytes_used"] < 0: log.error("No data stored under pool-1 is compressed") return 1 if pool_1_stats["kb_used"] >= pool_2_stats["kb_used"]: log.error("Compression has no effect on the pool size...") return 1 if config["Compression_tests"].get("verify_compression_ratio_set"): # added verification for test: CEPH-83571672 if not rados_obj.check_compression_size(pool_name=pool_1, **compression_config): log.error("data not compressed in accordance to ratio set") return 1 log.info("Pool size is less when compression is enabled") return 0 if config.get("check_autoscaler_profile"): """ Verifies that the default auto-scaler profile on 5.1 builds in scale-up Verifies bugs : 1. https://bugzilla.redhat.com/show_bug.cgi?id=2021738 """ build = config.get("build", config.get("rhbuild")) autoscale_conf = config.get("check_autoscaler_profile") regex = r"5.1-rhel-\d{1}" if re.search(regex, build): log.info( "Test running on 5.1 builds, checking the default autoscaler profile" ) if not mon_obj.verify_set_config(**autoscale_conf): log.error( f"The default value for autoscaler profile is not scale-up in buld {build}" ) return 1 log.info(f"Autoscale profile is scale-up in release : {build}") else: log.debug( f"The profile is already scale-up by default in release : {build}" ) return 0
def verify_mon_db_trim(ceph_cluster, node: CephAdmin, **kwargs): """ The Mon DB size should be reduced by removing the old mappings regularly. To verify this behaviour, Creating various scenarios where the DB would be updated with new mappings and verify it DB is getting trimmed. Verifies BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1905339 https://bugzilla.redhat.com/show_bug.cgi?id=1829646 https://bugzilla.redhat.com/show_bug.cgi?id=1972281 https://bugzilla.redhat.com/show_bug.cgi?id=1943357 https://bugzilla.redhat.com/show_bug.cgi?id=1766702 Args: ceph_cluster (ceph.ceph.Ceph): ceph cluster node: Cephadm node where the commands need to be executed kwargs: Any other KV pairs that need to be passed for testing Returns: Exception hit for failure conditions. """ # Creating rados object to run rados commands rados_obj = RadosOrchestrator(node=node) mon_nodes = ceph_cluster.get_nodes(role="mon") osd_nodes = ceph_cluster.get_nodes(role="osd") client_node = ceph_cluster.get_nodes(role="client")[0] installer_node = ceph_cluster.get_nodes(role="installer")[0] daemon_info = rados_obj.run_ceph_command(cmd="ceph orch ps") mon_daemons = [ entry for entry in daemon_info if entry["daemon_type"] == "mon" ] # Duration for which we will sleep after the mon DB changes are made and mon would have begun trimming old mappings mon_db_trim_wait_dur = 1200 # List to capture the mon db size throughout the duration of the test to check the variations in DB size mon_db_size_list = list() mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons)) # Collecting first and last commits to osdmap status = rados_obj.run_ceph_command(cmd="ceph report") init_commmits = { "osdmap_first_committed": float(status["osdmap_first_committed"]), "osdmap_last_committed": float(status["osdmap_last_committed"]), } # creating scenarios where the mon db would be updated with new info change_config_for_slow_ops(rados_obj=rados_obj, action="set", **kwargs) mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons)) # Starting read and write on by creating a test pool . pool_name = "test_pool_ops" if not rados_obj.create_pool(pool_name=pool_name, crush_rule="stretch_rule"): error = "failed to create pool to run IO" raise TestCaseFailureException(error) cmd = f"rados --no-log-to-stderr -b 1024 -p {pool_name} bench 400 write --no-cleanup &" client_node.exec_command(sudo=True, cmd=cmd) mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons)) # deleting a previously created pool to increase OSD operations and map changes # Pool created as part of suite set-up workflow. rados_obj.detete_pool(pool="delete_pool") # Proceeding to reboot 1 OSD from each host to trigger rebalance & Backfill cluster_fsid = rados_obj.run_ceph_command(cmd="ceph fsid")["fsid"] daemon_info = rados_obj.run_ceph_command(cmd="ceph orch ps") osd_daemons = [ entry for entry in daemon_info if entry["daemon_type"] == "osd" ] for onode in osd_nodes: for osd in osd_daemons: if re.search(osd["hostname"], onode.hostname): # Not using the container ID's provided in ceph orch ps command. # Bug : https://bugzilla.redhat.com/show_bug.cgi?id=1943494 # cmd = f"podman restart {osd['container_id']}" cmd = f"systemctl restart ceph-{cluster_fsid}@osd.{osd['daemon_id']}.service" log.info( f"rebooting osd-{osd['daemon_id']} on host {osd['hostname']}. 
Command {cmd}" ) onode.exec_command(sudo=True, cmd=cmd) # Sleeping for 5 seconds for status to be updated time.sleep(5) break # Re-weighting the OSd's based on usage to trigger rebalance # todo: Verify re-balancing process on OSD's ( PG movement across cluster) # todo: Add re-balancing based on crush item provided # BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1766702 rados_obj.reweight_crush_items() """ Waiting for 2 hours for cluster to get to active + clean state. Rationale: during cluster activities like backfill, rebalance, osd map change cause Mon DB to be updated. Hence, we can wait till mon DB updates are completed, after which DB size should be reduced, by trimming the old mappings once the new mappings are added. If cluster healthy state is reached within 2 hours, we exit the loop earlier, without waiting for stipulated time, But if cluster is still performing operations for long time, we would need at-least some data to make sure DB is not just increasing. ( DB is expected to increase when operations are in progress. Old mappings are removed when operations/ new updates are completed. ) """ end_time = datetime.datetime.now() + datetime.timedelta(seconds=7200) while end_time > datetime.datetime.now(): status_report = rados_obj.run_ceph_command(cmd="ceph report") ceph_health_status = status_report["health"] recovery_tuple = ("OSD_DOWN", "PG_AVAILABILITY", "PG_DEGRADED", "SLOW_OPS") daemon_info = rados_obj.run_ceph_command(cmd="ceph orch ps") mon_daemons = [ entry for entry in daemon_info if entry["daemon_type"] == "mon" ] mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons)) # Checking for any health warnings that increase db size flag = (True if not any(key in ceph_health_status["checks"].keys() for key in recovery_tuple) else False) # Proceeding to check if all PG's are in active + clean if flag: for entry in status_report["num_pg_by_state"]: rec = ("remapped", "backfilling", "degraded") if any(key in rec for key in entry["state"].split("+")): flag = False if flag: log.info( f"The recovery and back-filling of the OSD is completed" f"Sleeping for {mon_db_trim_wait_dur} Seconds after clean for mon to remove old mappings" ) time.sleep(mon_db_trim_wait_dur / 2) mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons)) time.sleep(mon_db_trim_wait_dur) break log.info( f"Waiting for active + clean. 
Active aletrs: {ceph_health_status['checks'].keys()}," f" checking status again in 1 minute") time.sleep(60) # Checking if any slow operations are reported and waiting for 'dur' for the slow_ops to be cleared end_time = datetime.datetime.now() + datetime.timedelta( seconds=mon_db_trim_wait_dur) while end_time > datetime.datetime.now(): if not get_slow_ops_data( node=node, installer=installer_node, action="current"): log.info("Operations in progress, checking again in 30 seconds") time.sleep(30) continue # Logging all the historic operations for reference / future enhancement get_slow_ops_data(node=node, installer=installer_node, action="historic") break # collecting the final size of the Mon DB and the OSD map epoch times daemon_info = rados_obj.run_ceph_command(cmd="ceph orch ps") mon_daemons = [ entry for entry in daemon_info if entry["daemon_type"] == "mon" ] final_db_size = get_mondb_size(mon_nodes[0], mon_daemons) final_status = rados_obj.run_ceph_command(cmd="ceph report") final_commmits = { "osdmap_first_committed": float(final_status["osdmap_first_committed"]), "osdmap_last_committed": float(final_status["osdmap_last_committed"]), } mon_db_max_size = max(mon_db_size_list) # Getting the trend of mon DB size when the operations were running on cluster. mon_db_size_size_change = list() for i in range(len(mon_db_size_list) - 1): mon_db_size_size_change.append(mon_db_size_list[i + 1] - mon_db_size_list[i]) # Reverting the config changes made for generation slow_ops change_config_for_slow_ops(rados_obj=rados_obj, action="rm", **kwargs) # Checking the final results if True not in list(map(lambda x: x <= 0, mon_db_size_size_change)): error = f"The mon DB is only increasing since the test begun. DB sizes {mon_db_size_list}" raise TestCaseFailureException(error) if not final_db_size <= mon_db_max_size: error = ( f"The mon DB size after cluster clean is higher than when operations were being performed.\n" f"max size during operations : {mon_db_max_size} , final DB size after clean {final_db_size}" ) log.error(error) raise TestCaseFailureException() # Initial update of OSD maps can be the same at the beginning and end of test if (final_commmits["osdmap_first_committed"] < init_commmits["osdmap_first_committed"]): error = ( f"The OSD map has not been updated of first commits\n" f"The commits are initial commits : {init_commmits}, final commits : {final_commmits}" ) log.error(error) raise TestCaseFailureException() # Final updates need to be more than initial updates as there are OSD map changes during the duration of test if (final_commmits["osdmap_last_committed"] <= init_commmits["osdmap_last_committed"]): error = ( f"The OSD map has not been updated of last commits\n" f"The commits are initial commits : {init_commmits}, final commits : {final_commmits}" ) log.error(error) raise TestCaseFailureException() # The number of OSD mappings present on cluster should not exceed 800 in total # https://tracker.ceph.com/issues/37875#note-1 if (final_commmits["osdmap_last_committed"] - final_commmits["osdmap_first_committed"]) > 800: error = ( f"There are still too many old commits in Mon DB. 
OSD map not trimmed as per needed\n" f"The commits are initial commits : {init_commmits}, final commits : {final_commmits}" ) log.error(error) raise TestCaseFailureException() # Checking the paxos trimming sizes if (int(final_status["paxos"]["last_committed"]) - int(final_status["paxos"]["first_committed"])) > 1000: error = ( f"There are still too many old commits in Mon DB.\n" f"The commits are initial commits : {final_status['paxos']['first_committed']}," f" final commits : {final_status['paxos']['last_committed']}") log.error(error) raise TestCaseFailureException() log.info("mon DB was trimmed successfully")
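# A minimal standalone sketch of the trim-bound assertions performed at the end of
# verify_mon_db_trim(), reading the same fields from "ceph report". The 800 and 1000 limits are
# the values used above (see https://tracker.ceph.com/issues/37875#note-1); this helper is
# illustrative and not called by the test.
def _example_trim_bounds_ok(report: dict) -> bool:
    osdmap_span = float(report["osdmap_last_committed"]) - float(report["osdmap_first_committed"])
    paxos_span = int(report["paxos"]["last_committed"]) - int(report["paxos"]["first_committed"])
    return osdmap_span <= 800 and paxos_span <= 1000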
def run(ceph_cluster, **kw): """ Performs various pool related validation tests Returns: 1 -> Fail, 0 -> Pass """ log.info(run.__doc__) config = kw["config"] cephadm = CephAdmin(cluster=ceph_cluster, **config) rados_obj = RadosOrchestrator(node=cephadm) mon_obj = MonConfigMethods(rados_obj=rados_obj) pool_obj = PoolFunctions(node=cephadm) if config.get("ec_pool_recovery_improvement"): ec_config = config.get("ec_pool_recovery_improvement") if not rados_obj.create_erasure_pool(name="recovery", **ec_config): log.error("Failed to create the EC Pool") return 1 if not rados_obj.bench_write(**ec_config): log.error("Failed to write objects into the EC Pool") return 1 rados_obj.bench_read(**ec_config) log.info("Created the EC Pool, Finished writing data into the pool") # getting the acting set for the created pool acting_pg_set = rados_obj.get_pg_acting_set( pool_name=ec_config["pool_name"]) if len(acting_pg_set) != ec_config["k"] + ec_config["m"]: log.error( f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m" ) return 1 log.info( f" Acting set of the pool consists of OSD's : {acting_pg_set}") log.info( f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery" ) stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])] for osd_id in stop_osds: if not rados_obj.change_osd_state(action="stop", target=osd_id): log.error(f"Unable to stop the OSD : {osd_id}") return 1 log.info( "Stopped 'm' number of OSD's from, starting to wait for recovery") rados_obj.change_recover_threads(config=ec_config, action="set") # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down time.sleep(25) # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000) while end_time > datetime.datetime.now(): flag = True status_report = rados_obj.run_ceph_command(cmd="ceph report") # Proceeding to check if all PG's are in active + clean for entry in status_report["num_pg_by_state"]: rec = ( "backfilling", "degraded", "incomplete", "recovering", "recovery_wait", "backfilling_wait", "peered", "undersized", ) if any(key in rec for key in entry["state"].split("+")): flag = False if flag: log.info( "The recovery and back-filling of the OSD is completed") break log.info( f"Waiting for active + clean. 
Active aletrs: {status_report['health']['checks'].keys()}," f"PG States : {status_report['num_pg_by_state']}" f" checking status again in 1 minute") time.sleep(60) # getting the acting set for the created pool after recovery acting_pg_set = rados_obj.get_pg_acting_set( pool_name=ec_config["pool_name"]) if len(acting_pg_set) != ec_config["k"] + ec_config["m"]: log.error( f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m" ) return 1 log.info( f" Acting set of the pool consists of OSD's : {acting_pg_set}") # Changing recovery threads back to default rados_obj.change_recover_threads(config=ec_config, action="rm") log.debug("Starting the stopped OSD's") for osd_id in stop_osds: if not rados_obj.change_osd_state(action="restart", target=osd_id): log.error(f"Unable to restart the OSD : {osd_id}") return 1 # Sleep for 5 seconds for OSD's to join the cluster time.sleep(5) if not flag: log.error( "The pool did not reach active + Clean state after recovery") return 1 # Deleting the pool created if not rados_obj.detete_pool(pool=ec_config["pool_name"]): log.error( f"the pool {ec_config['pool_name']} could not be deleted") return 1 log.info("Successfully tested EC pool recovery with K osd's surviving") return 0 if config.get("Compression_tests"): """ Create a 2 replicated pools: 1. Pool_1 : enable any compression algorithm(def snappy) and compression mode(aggressive/force). 2. Pool_2 : set compression mode to none Writing the same amount of data on 2 pools, size of pool with compression on would consume less space """ pool_config = config["Compression_tests"]["pool_config"] compression_config = config["Compression_tests"]["compression_config"] pool_1 = pool_config["pool-1"] pool_2 = pool_config["pool-2"] if config["Compression_tests"]["pool_type"] == "replicated": if not rados_obj.create_pool(pool_name=pool_1, **pool_config): log.error("could not create pool-1") return 1 if not rados_obj.create_pool(pool_name=pool_2, **pool_config): log.error("could not create pool-2") return 1 elif config["Compression_tests"]["pool_type"] == "erasure": pool_config["pool_name"] = pool_1 if not rados_obj.create_erasure_pool(name=pool_1, **pool_config): log.error("could not create pool-1") return 1 pool_config["pool_name"] = pool_2 if not rados_obj.create_erasure_pool(name=pool_2, **pool_config): log.error("could not create pool-2") return 1 del pool_config["pool_name"] log.debug("Created two pools to test compression") # Enabling compression on pool-1 if not rados_obj.pool_inline_compression(pool_name=pool_1, **compression_config): log.error( f"Error setting compression on pool : {pool_1} for config {compression_config}" ) return 1 # Writing the same amount of data into two pools if not rados_obj.bench_write(pool_name=pool_1, **pool_config): log.error( "Failed to write objects into Pool-1, with compression enabled" ) return 1 if not rados_obj.bench_write(pool_name=pool_2, **pool_config): log.error( "Failed to write objects into Pool-2, without compression enabled" ) return 1 # Sleeping for 5 seconds for status to be updated. time.sleep(5) log.debug( "Finished writing data into the two pools. 
Checking pool stats") try: pool_stats = rados_obj.run_ceph_command( cmd="ceph df detail")["pools"] pool_1_stats = [ detail for detail in pool_stats if detail["name"] == pool_1 ][0]["stats"] pool_2_stats = [ detail for detail in pool_stats if detail["name"] == pool_2 ][0]["stats"] except KeyError: log.error( "No stats about the pools requested found on the cluster") return 1 log.debug(f"Pool-1 stats: {pool_1_stats}") log.debug(f"Pool-2 stats: {pool_2_stats}") if pool_1_stats["compress_bytes_used"] < 0: log.error("No data stored under pool-1 is compressed") return 1 if pool_1_stats["kb_used"] >= pool_2_stats["kb_used"]: log.error("Compression has no effect on the pool size...") return 1 if config["Compression_tests"].get("verify_compression_ratio_set"): # added verification for test: CEPH-83571672 if not rados_obj.check_compression_size(pool_name=pool_1, **compression_config): log.error("data not compressed in accordance to ratio set") return 1 log.info("Pool size is less when compression is enabled") return 0 if config.get("test_autoscaler_bulk_feature"): """ Tests to verify the autoscaler bulk flag, which allows pools to make use of scale-down profile, making those pools start with full compliments of PG sets. Tests include 1. creating new pools with bulk, 2. enabling/disabling bulk flag on existing pools 3. Verify the PG changes when the flag is set/unset Verifies bugs : https://bugzilla.redhat.com/show_bug.cgi?id=2049851 """ regex = r"\s*(\d.\d)-rhel-\d" build = (re.search(regex, config.get("build", config.get("rhbuild")))).groups()[0] if not float(build) > 5.0: log.info( "Test running on version less than 5.1, skipping verifying bulk flags" ) return 0 # Creating a pool with bulk feature pool_name = config.get("pool_name") if not pool_obj.set_bulk_flag(pool_name=pool_name): log.error("Failed to create a pool with bulk features") return 1 # Checking the autoscaler status, final PG counts, bulk flags pg_target_init = pool_obj.get_target_pg_num_bulk_flag( pool_name=pool_name) # Unsetting the bulk flag and checking the change in the PG counts if not pool_obj.rm_bulk_flag(pool_name=pool_name): log.error("Failed to create a pool with bulk features") return 1 # Sleeping for 5 seconds for new PG num to bets et time.sleep(5) pg_target_interim = pool_obj.get_target_pg_num_bulk_flag( pool_name=pool_name) # The target PG's once the flag is disabled must be lesser than when enabled if pg_target_interim >= pg_target_init: log.error("PG's not reduced after bulk flag disabled") return 1 # Setting the bulk flag on pool again and checking the change in the PG counts if not pool_obj.set_bulk_flag(pool_name=pool_name): log.error("Failed to disable/remove bulk features on pool") return 1 # Sleeping for 5 seconds for new PG num to bets et time.sleep(5) pg_target_final = pool_obj.get_target_pg_num_bulk_flag( pool_name=pool_name) # The target PG's once the flag is disabled must be lesser than when enabled if pg_target_interim >= pg_target_final: log.error("PG's not Increased after bulk flag Enabled") return 1 if config.get("delete_pool"): rados_obj.detete_pool(pool=pool_name) log.info("Verified the workings of bulk flag") return 0 if config.get("verify_pool_target_ratio"): log.debug("Verifying target size ratio on pools") target_configs = config["verify_pool_target_ratio"]["configurations"] # Creating pools and starting the test for entry in target_configs.values(): log.debug(f"Creating {entry['pool_type']} pool on the cluster") if entry.get("pool_type", "replicated") == "erasure": 
method_should_succeed(rados_obj.create_erasure_pool, name=entry["pool_name"], **entry) else: method_should_succeed( rados_obj.create_pool, **entry, ) rados_obj.bench_write(**entry) if not pool_obj.verify_target_ratio_set( pool_name=entry["pool_name"], ratio=entry["target_size_ratio"]): log.error( f"Could not change the target ratio on the pool: {entry['pool_name']}" ) return 1 log.debug("Set the ratio. getting the projected pg's") rados_obj.change_recover_threads(config=config, action="set") log.debug( "Waiting for the rebalancing to complete on the cluster after the change" ) # Sleeping for 2 minutes for rebalancing to start & for new PG count to be updated. time.sleep(120) new_pg_count = int( pool_obj.get_pg_autoscaler_value(pool_name=entry["pool_name"], item="pg_num_target")) if new_pg_count <= entry["pg_num"]: log.error( f"Count of PG's not increased on the pool: {entry['pool_name']}" f"Initial creation count : {entry['pg_num']}" f"New count after setting num target : {new_pg_count}") return 1 res = wait_for_clean_pg_sets(rados_obj) if not res: log.error( "PG's in cluster are not active + Clean after the ratio change" ) return 1 if not pool_obj.verify_target_ratio_set( pool_name=entry["pool_name"], ratio=0.0): log.error( f"Could not remove the target ratio on the pool: {entry['pool_name']}" ) return 1 # Sleeping for 2 minutes for rebalancing to start & for new PG count to be updated. time.sleep(120) # Checking if after the removal of ratio, the PG count has reduced end_pg_count = int( pool_obj.get_pg_autoscaler_value(pool_name=entry["pool_name"], item="pg_num_target")) if end_pg_count >= new_pg_count: log.error( f"Count of PG's not changed/ reverted on the pool: {entry['pool_name']}" f" after removing the target ratios") return 1 rados_obj.change_recover_threads(config=config, action="rm") if entry.get("delete_pool", False): rados_obj.detete_pool(pool=entry["pool_name"]) log.info( f"Completed the test of target ratio on pool: {entry['pool_name']} " ) log.info("Target ratio tests completed") return 0 if config.get("verify_mon_target_pg_per_osd"): pg_conf = config.get("verify_mon_target_pg_per_osd") if not mon_obj.set_config(**pg_conf): log.error("Could not set the value for mon_target_pg_per_osd ") return 1 mon_obj.remove_config(**pg_conf) log.info("Set and verified the value for mon_target_pg_per_osd ") return 0 if config.get("verify_pg_num_min"): log.debug("Verifying pg_num_min on pools") target_configs = config["verify_pg_num_min"]["configurations"] # Creating pools and starting the test for entry in target_configs.values(): log.debug(f"Creating {entry['pool_type']} pool on the cluster") if entry.get("pool_type", "replicated") == "erasure": method_should_succeed(rados_obj.create_erasure_pool, name=entry["pool_name"], **entry) else: method_should_succeed( rados_obj.create_pool, **entry, ) rados_obj.bench_write(**entry) if not rados_obj.set_pool_property(pool=entry["pool_name"], props="pg_num_min", value=entry["pg_num_min"]): log.error("Could not set the pg_min_size on the pool") return 1 if entry.get("delete_pool", False): rados_obj.detete_pool(pool=entry["pool_name"]) log.info( f"Completed the test of pg_min_num on pool: {entry['pool_name']} " ) log.info("pg_min_num tests completed") return 0
def run(ceph_cluster, **kw):
    """
    Changes b/w various election strategies and observes mon quorum behaviour
    Returns: 1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    cephadm_node_mon = ceph_cluster.get_nodes(role="installer")[0]

    # Collecting the number of mons in the quorum before the test
    mon_init_count = len(mon_obj.get_mon_quorum().keys())

    # By default, the election strategy is classic. Verifying that
    strategy = mon_obj.get_election_strategy()
    if strategy != 1:
        log.error(
            f"cluster created with election strategy other than classic, i.e {strategy}"
        )
        return 1

    # Changing strategy to 2, i.e. disallowed mode.
    if not mon_obj.set_election_strategy(mode="disallow"):
        log.error("could not set election strategy to disallow mode")
        return 1

    # sleeping for 2 seconds for new elections to be triggered and a new leader to be elected
    time.sleep(2)

    log.info("Set election strategy to disallow mode. Adding disallowed mons")
    # Checking if a new leader is chosen when the current leader is added to the disallowed list
    old_leader = mon_obj.get_mon_quorum_leader()
    if not mon_obj.set_disallow_mon(mon=old_leader):
        log.error(f"could not add mon: {old_leader} to the disallowed list")
        return 1

    # sleeping for 2 seconds for new elections to be triggered and a new leader to be elected
    time.sleep(2)

    current_leader = mon_obj.get_mon_quorum_leader()
    if re.search(current_leader, old_leader):
        log.error(f"The mon: {old_leader} added to the disallow list is still leader")
        return 1

    # removing the mon from the disallowed list
    if not mon_obj.remove_disallow_mon(mon=old_leader):
        log.error(f"could not remove mon: {old_leader} from the disallowed list")
        return 1

    # sleeping for 2 seconds for new elections to be triggered and a new leader to be elected
    time.sleep(2)

    # Changing strategy to 3, i.e. connectivity mode.
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Checking connectivity scores of all the mons
    cmd = f"ceph daemon mon.{cephadm_node_mon.hostname} connection scores dump"
    rados_obj.run_ceph_command(cmd=cmd)

    # Changing strategy to default
    if not mon_obj.set_election_strategy(mode="classic"):
        log.error("could not set election strategy to classic mode")
        return 1

    # sleeping for 5 seconds for new elections to be triggered and a new leader to be elected
    time.sleep(5)

    # Collecting the number of mons in the quorum after the test
    # todo: add other tests to ascertain the health of mon daemons in quorum
    mon_final_count = len(mon_obj.get_mon_quorum().keys())
    if mon_final_count < mon_init_count:
        log.error("There are fewer mons in the quorum at the end of the test than before it")
        return 1

    log.info("Completed all mon election test cases")
    return 0
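# Reference note: the integer returned by get_election_strategy() maps to the monitor election
# strategies. Values 1 (classic) and 3 (connectivity) are the ones checked by the test above;
# 2 (disallow) completes the mapping per the Ceph monitor election documentation. The lookup is
# illustrative only and not used by the test.
ELECTION_STRATEGY_NAMES = {1: "classic", 2: "disallow", 3: "connectivity"}

def _example_strategy_name(strategy: int) -> str:
    return ELECTION_STRATEGY_NAMES.get(strategy, f"unknown({strategy})")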
def run(ceph_cluster, **kw): """ enables connectivity mode and deploys stretch cluster with arbiter mon node Actions Performed: 1. Disables the automatic crush map update 2. Collects the OSD daemons in the cluster and split them into 2 sites. 3. If add capacity is selected, only half of the OSD's will be added to various sites initially. 4. Adds the stretch rule into crush map. 5. Adding monitors into the 2 sites. 6. Create a replicated pool and deploy stretch mode. 7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites ) 8. Check for the bump in election epochs throughout. 9. Check the acting set in PG for 4 OSD's. 2 from each site. Verifies bugs: [1]. https://bugzilla.redhat.com/show_bug.cgi?id=1937088 [2]. https://bugzilla.redhat.com/show_bug.cgi?id=1952763 Args: ceph_cluster (ceph.ceph.Ceph): ceph cluster """ log.info("Deploying stretch cluster with arbiter mon node") log.info(run.__doc__) config = kw.get("config") cephadm = CephAdmin(cluster=ceph_cluster, **config) rados_obj = RadosOrchestrator(node=cephadm) mon_obj = MonElectionStrategies(rados_obj=rados_obj) client_node = ceph_cluster.get_nodes(role="client")[0] tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0] if not client_node and not tiebreaker_node: log.error( "Admin client and tie breaker node not configured, Cannot modify crush rules for stretch cluster" ) return 1 mon_state = get_mon_details(node=cephadm) if len(list(mon_state["monitors"])) < 5: log.error( f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}" ) return 1 osd_details = get_osd_details(node=cephadm) if len(osd_details.keys()) < 4: log.error( f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}" ) return 1 if config.get("verify_forced_recovery"): log.info("Verifying forced recovery and healthy in stretch environment") pool_name = "stretch_pool_recovery" if not rados_obj.create_pool(pool_name=pool_name, pg_num=16): log.error("Failed to create the replicated Pool") return 1 # getting the acting set for the created pool acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool_name) log.info( f"Killing 2 OSD's from acting set : {acting_pg_set} to verify recovery" ) stop_osds = [acting_pg_set.pop() for _ in range(2)] for osd_id in stop_osds: if not rados_obj.change_osd_state(action="stop", target=osd_id): log.error(f"Unable to stop the OSD : {osd_id}") return 1 # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down time.sleep(25) log.info("Stopped 2 OSD's from acting set, starting to wait for recovery") rados_obj.change_recover_threads(config=config, action="set") if not rados_obj.bench_write(pool_name=pool_name, **config): log.error("Failed to write objects into the Pool") return 1 log.debug("Triggering forced recovery in stretch mode") cmd = "ceph osd force_recovery_stretch_mode --yes-i-really-mean-it" rados_obj.run_ceph_command(cmd) log.info("Triggered the recovery in stretch mode") log.debug("Starting the stopped OSD's") for osd_id in stop_osds: if not rados_obj.change_osd_state(action="restart", target=osd_id): log.error(f"Unable to restart the OSD : {osd_id}") return 1 # there was data written into pool when the OSD's were down. 
# Verifying if data is recovered and placed into the OSD's after bringing them back res = wait_for_clean_pg_sets(rados_obj) if not res: log.error("PG's in cluster are not active + Clean ") return 1 log.debug("Forcing the stretch cluster into healthy mode") cmd = "ceph osd force_healthy_stretch_mode --yes-i-really-mean-it" rados_obj.run_ceph_command(cmd) log.info("Cluster has successfully recovered and is in healthy state") return 0 # Finding and Deleting any stray EC pools that might have been left on cluster pool_dump = rados_obj.run_ceph_command(cmd="ceph osd dump") for entry in pool_dump["pools"]: if entry["type"] != 1 and entry["crush_rule"] != 0: log.info( f"A non-replicated pool found : {entry['pool_name']}, proceeding to delete pool" ) if not rados_obj.detete_pool(pool=entry["pool_name"]): log.error(f"the pool {entry['pool_name']} could not be deleted") return 1 log.debug("No pools other than replicated found on cluster") # disabling automatic crush update cmd = "ceph config set osd osd_crush_update_on_start false" cephadm.shell([cmd]) site1 = config.get("site1", "site1") site2 = config.get("site2", "site2") # Collecting osd details and split them into Sita A and Site B sorted_osds = sort_osd_sites(all_osd_details=osd_details) site_a_osds = sorted_osds[0] site_b_osds = sorted_osds[1] if config.get("perform_add_capacity"): site_a_osds = sorted_osds[0][: (len(sorted_osds[0]) // 2)] site_b_osds = sorted_osds[1][: (len(sorted_osds[1]) // 2)] if not set_osd_sites( node=cephadm, osds=site_a_osds, site=site1, all_osd_details=osd_details, ): log.error("Failed to move the OSD's into sites") return 1 if not set_osd_sites( node=cephadm, osds=site_b_osds, site=site2, all_osd_details=osd_details, ): log.error("Failed to move the OSD's into sites") return 1 # collecting mon map to be compared after strtech cluster deployment stretch_rule_name = "stretch_rule" if not setup_crush_rule( node=client_node, rule_name=stretch_rule_name, site1=site1, site2=site2 ): log.error("Failed to Add crush rules in the crush map") return 1 # Setting the election strategy to connectivity mode if not mon_obj.set_election_strategy(mode="connectivity"): log.error("could not set election strategy to connectivity mode") return 1 # Sleeping for 5 sec for the strategy to be active time.sleep(5) init_mon_state = get_mon_details(node=cephadm) # Checking if mon elections happened after changing election strategy if mon_state["epoch"] > init_mon_state["epoch"]: log.error("Election epoch not bumped up after setting the connectivity mode.") return 1 # Checking updated election strategy in mon map strategy = mon_obj.get_election_strategy() if strategy != 3: log.error( f"cluster created election strategy other than connectivity, i.e {strategy}" ) return 1 log.info("Enabled connectivity mode on the cluster") log.info(f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3") if not set_mon_sites( node=cephadm, tiebreaker_node=tiebreaker_node, site1=site1, site2=site2 ): log.error("Failed to ad monitors into respective sites") return 1 # All the existing pools should be automatically changed with stretch rule. Creating a test pool pool_name = "test_pool_1" if not rados_obj.create_pool(pool_name=pool_name, pg_num=16): log.error("Failed to create the replicated Pool") return 1 log.info("Monitors added to respective sites. 
enabling stretch rule") cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter" try: cephadm.shell([cmd]) except Exception as err: log.error( f"Error while enabling stretch rule on the datacenter. Command : {cmd}" ) log.error(err) return 1 if get_mon_details(node=cephadm)["epoch"] < init_mon_state["epoch"]: log.error("Election epoch not bumped up after Enabling strech mode") return 1 # Increasing backfill/rebalance threads so that cluster will re-balance it faster rados_obj.change_recover_threads(config=config, action="set") # wait for active + clean after deployment of stretch mode # checking the state after deployment coz of BZ : https://bugzilla.redhat.com/show_bug.cgi?id=2025800 res = wait_for_clean_pg_sets(rados_obj) if not res: status_report = rados_obj.run_ceph_command(cmd="ceph report") # Proceeding to check if all PG's are in active + clean for entry in status_report["num_pg_by_state"]: rec = ("remapped", "peering") if any(key in rec for key in entry["state"].split("+")): log.error( "PG's in cluster are stuck in remapped+peering after stretch deployment." ) return 1 if config.get("perform_add_capacity"): pool_name = "test_stretch_pool" if not rados_obj.create_pool( pool_name=pool_name, crush_rule=stretch_rule_name, ): log.error("Failed to create the replicated Pool") return 1 do_rados_put(mon=client_node, pool=pool_name, nobj=100) log.info("Performing add Capacity after the deployment of stretch cluster") site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds] site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds] if not set_osd_sites( node=cephadm, osds=site_a_osds, site=site1, all_osd_details=osd_details, ): log.error("Failed to move the OSD's into sites") return 1 if not set_osd_sites( node=cephadm, osds=site_b_osds, site=site2, all_osd_details=osd_details, ): log.error("Failed to move the OSD's into sites") return 1 flag = wait_for_clean_pg_sets(rados_obj) if not flag: log.error( "The cluster did not reach active + Clean state after add capacity" ) return 1 with parallel() as p: p.spawn(do_rados_get, client_node, pool_name, 10) for res in p: log.info(res) log.info("Successfully completed Add Capacity scenario") rados_obj.change_recover_threads(config=config, action="rm") # Checking if the pools have been updated with the new crush rules acting_set = rados_obj.get_pg_acting_set(pool_name=pool_name) if len(acting_set) != 4: log.error( f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4" ) return 1 log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG") log.info("Stretch rule with arbiter monitor node set up successfully") return 0
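# For context, a sketch of the kind of CRUSH rule that setup_crush_rule() is expected to inject
# for the two sites, based on the upstream stretch-mode documentation (two OSDs chosen per site,
# matching the 4-OSD acting set verified above). The helper may generate different rule text;
# this constant is illustrative only and is not used by the test.
EXAMPLE_STRETCH_CRUSH_RULE = """
rule stretch_rule {
    id 111
    type replicated
    step take site1
    step chooseleaf firstn 2 type host
    step emit
    step take site2
    step chooseleaf firstn 2 type host
    step emit
}
"""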
def run(ceph_cluster, **kw): """ Test to create a large number of omap entries on the single PG pool and test osd resiliency Returns: 1 -> Fail, 0 -> Pass """ log.info(run.__doc__) config = kw["config"] cephadm = CephAdmin(cluster=ceph_cluster, **config) rados_obj = RadosOrchestrator(node=cephadm) pool_obj = PoolFunctions(node=cephadm) pool_target_configs = config["verify_osd_omap_entries"]["configurations"] omap_target_configs = config["verify_osd_omap_entries"]["omap_config"] # Creating pools and starting the test for entry in pool_target_configs.values(): log.debug( f"Creating {entry['pool_type']} pool on the cluster with name {entry['pool_name']}" ) if entry.get("pool_type", "replicated") == "erasure": method_should_succeed(rados_obj.create_erasure_pool, name=entry["pool_name"], **entry) else: method_should_succeed( rados_obj.create_pool, **entry, ) log.debug( "Created the pool. beginning to create large number of omap entries on the pool" ) if not pool_obj.fill_omap_entries(pool_name=entry["pool_name"], **omap_target_configs): log.error( f"Omap entries not generated on pool {entry['pool_name']}") return 1 # Fetching the current acting set for the pool acting_set = rados_obj.get_pg_acting_set(pool_name=entry["pool_name"]) rados_obj.change_recover_threads(config={}, action="set") log.debug( f"Proceeding to restart OSd's from the acting set {acting_set}") for osd_id in acting_set: rados_obj.change_osd_state(action="stop", target=osd_id) # sleeping for 5 seconds for re-balancing to begin time.sleep(5) # Waiting for cluster to get clean state after OSD stopped if not wait_for_clean_pg_sets(rados_obj): log.error("PG's in cluster are not active + Clean state.. ") return 1 rados_obj.change_osd_state(action="restart", target=osd_id) log.debug( f"Cluster reached clean state after osd {osd_id} stop and restart" ) rados_obj.change_recover_threads(config={}, action="rm") # deleting the pool created after the test rados_obj.detete_pool(pool=entry["pool_name"]) log.info( f"All the OSD's from the acting set {acting_set} were restarted " f"and object movement completed for pool {entry['pool_name']}") log.info( "Completed testing effects of large number of omap entries on pools ") return 0
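
# A small spot-check helper (a sketch, not used by the test above): after
# fill_omap_entries(), the omap key count of a single object can be read back
# through the rados CLI. Assumes a client node with ceph credentials and that
# exec_command() returns stdout as a string; the object name below is hypothetical.
def count_omap_keys(client_node, pool_name: str, obj_name: str) -> int:
    """Returns the number of omap keys present on one object via 'rados listomapkeys'."""
    out, _ = client_node.exec_command(
        sudo=True, cmd=f"rados -p {pool_name} listomapkeys {obj_name}"
    )
    return len([line for line in out.splitlines() if line.strip()])


# e.g. count_omap_keys(client_node, pool_name="omap_pool", obj_name="obj_0")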
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns: 1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1
        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"Acting set consists of only these OSD's : {acting_pg_set}, fewer than k+m"
            )
            return 1
        log.info(f"Acting set of the pool consists of OSD's : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from the acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1
        log.info("Stopped 'm' OSD's from the acting set, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for OSD's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")
            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfill_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False
            if flag:
                log.info("The recovery and backfilling of the OSD's is completed")
                break
            log.info(
                f"Waiting for active + clean. Active alerts: {status_report['health']['checks'].keys()},"
                f" PG States : {status_report['num_pg_by_state']},"
                f" checking status again in 1 minute"
            )
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"Acting set consists of only these OSD's : {acting_pg_set}, fewer than k+m"
            )
            return 1
        log.info(f"Acting set of the pool consists of OSD's : {acting_pg_set}")

        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error("The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(f"the pool {ec_config['pool_name']} could not be deleted")
            return 1
        log.info("Successfully tested EC pool recovery with K OSD's surviving")
        return 0
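
# Illustrative refactor (a sketch only): the inline 2.5 hour polling loop used
# above could be pulled out into a generic wait. It relies on the same assumption
# as the test, i.e. that 'ceph report' returns JSON with a "num_pg_by_state" list.
import datetime
import time


def wait_for_pg_states(
    rados_obj,
    unwanted=("recovering", "recovery_wait", "backfilling", "backfill_wait", "degraded"),
    timeout=9000,
    interval=60,
) -> bool:
    """Polls 'ceph report' until no PG carries any of the unwanted state keywords."""
    end_time = datetime.datetime.now() + datetime.timedelta(seconds=timeout)
    while datetime.datetime.now() < end_time:
        report = rados_obj.run_ceph_command(cmd="ceph report")
        stuck = [
            entry["state"]
            for entry in report["num_pg_by_state"]
            if any(state in unwanted for state in entry["state"].split("+"))
        ]
        if not stuck:
            return True
        log.debug(f"PGs still transitioning: {stuck}, rechecking in {interval}s")
        time.sleep(interval)
    return False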
class PoolFunctions: """ Contains various functions that help in altering the behaviour, working of pools and verify the changes """ def __init__(self, node: CephAdmin): """ initializes the env to run rados commands Args: node: CephAdmin object """ self.rados_obj = RadosOrchestrator(node=node) def verify_target_ratio_set(self, pool_name, ratio): """ Sets the "target_size_ratio" on the given pool and verifies it from the auto-scale status Args: pool_name: name of the pool ratio: ratio to be set Returns: True -> pass, False -> fail """ log.debug(f"Setting ratio: {ratio} on pool: {pool_name}") self.rados_obj.set_pool_property(pool=pool_name, props="target_size_ratio", value=ratio) # sleeping for 2 seconds for pg autoscaler updates the status and new PG's time.sleep(2) ratio_set = self.get_pg_autoscaler_value(pool_name, item="target_ratio") if not ratio_set == ratio: log.error("specified target ratio not set on the pool") return False return True def get_pg_autoscaler_value(self, pool_name, item): """ Fetches the target ratio set on the pool given Args: pool_name: name of the pool item: Value of the item to be fetched. Allowed values: actual_capacity_ratio|actual_raw_used|bias|capacity_ratio|crush_root_id|target_bytes| effective_target_ratio|logical_used|pg_autoscale_mode|pg_num_target|pool_id|raw_used|target_ratio| Returns: Requested value """ cmd = "ceph osd pool autoscale-status" autoscale_status = self.rados_obj.run_ceph_command(cmd=cmd) try: pool_details = [ details for details in autoscale_status if details["pool_name"] == pool_name ][0] except Exception: log.error("Pool not found") return pool_details[item] def fill_omap_entries(self, pool_name, **kwargs): """ creates key-value entries for objects on ceph pools and increase the omap entries on the pool eg : if obj_start, obj_end: 0, 3000 objects, with num_keys 1000, the method would create 3000 objects with 1k KW pairs each. so total 3000*1000 KW entries Args: pool_name: name of the pool where the KW pairs needed to be added to objects **kwargs: other args that can be passed Valid args: 1. obj_start: start count for object creation 2. obj_end : end count for object creation 3. 
num_keys_obj: Number of KW pairs to be added to each object
        Returns: True -> pass, False -> fail
        """
        # Getting the client node to perform the operations
        client_node = self.rados_obj.ceph_cluster.get_nodes(role="client")[0]
        obj_start = kwargs.get("obj_start", 0)
        obj_end = kwargs.get("obj_end", 2000)
        num_keys_obj = kwargs.get("num_keys_obj", 20000)
        log.debug(
            f"Writing {(obj_end - obj_start) * num_keys_obj} KW pairs"
            f" to increase the omap entries on pool {pool_name}"
        )
        script_loc = "https://raw.githubusercontent.com/red-hat-storage/cephci/master/utility/generate_omap_entries.py"
        client_node.exec_command(
            sudo=True,
            cmd=f"curl -k {script_loc} -O",
        )
        # Setup Script pre-requisites : docopt
        client_node.exec_command(
            sudo=True, cmd="pip3 install docopt", long_running=True
        )
        cmd_options = f"--pool {pool_name} --start {obj_start} --end {obj_end} --key-count {num_keys_obj}"
        cmd = f"python3 generate_omap_entries.py {cmd_options}"
        client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

        # removing the py file copied onto the client
        client_node.exec_command(sudo=True, cmd="rm -rf generate_omap_entries.py")

        log.debug("Checking the amount of omap entries created on the pool")
        pool_stats = self.rados_obj.run_ceph_command(cmd="ceph df detail")["pools"]
        for detail in pool_stats:
            if detail["name"] == pool_name:
                pool_1_stats = detail["stats"]
                total_omap_data = pool_1_stats["omap_bytes_used"]
                omap_data = pool_1_stats["stored_omap"]
                break
        if not omap_data:
            log.error("No omap entries written into pool")
            return False
        log.info(
            f"Wrote {omap_data} bytes of omap data on the pool."
            f" Total stored omap data on pool : {total_omap_data}"
        )
        return True
[ pg_id ]: Pg ID (Optional, but when provided, should be passed along with pool name ) Returns: True -> pass, False -> fail """ obj_cmd = f"rados -p {pool_name} ls" if pg_id: obj_cmd = f"rados --pgid {pg_id} ls" delete_obj_list = self.rados_obj.run_ceph_command(cmd=obj_cmd, timeout=1000) for obj in delete_obj_list: cmd = f"rados -p {pool_name} rm {obj['name']}" self.rados_obj.node.shell([cmd], long_running=True) # Sleeping for 3 seconds for object reference to be deleted time.sleep(3) # Checking if object is still present in the pool out = self.rados_obj.run_ceph_command(cmd=obj_cmd, timeout=1000) rem_objs = [obj["name"] for obj in out] if obj["name"] in rem_objs: log.error(f"Object {obj['name']} not deleted in the pool") return False log.debug(f"deleted object: {obj['name']} from pool {pool_name}") log.info(f"Completed deleting all objects from pool {pool_name}") return True def create_pool_snap(self, pool_name: str): """ Creates snapshots of the given pool Args: pool_name: name of the pool Returns: Pass -> name of the snapshot created, Fail -> False """ # Checking if snapshots can be created on the supplied pool cmd = "ceph osd dump" pool_status = self.rados_obj.run_ceph_command(cmd=cmd, timeout=800) for detail in pool_status["pools"]: if detail["pool_name"] != pool_name: continue if "selfmanaged_snaps" in detail["flags_names"]: # bz: https://bugzilla.redhat.com/show_bug.cgi?id=1425803#c2 log.error( f"Pool {pool_name} is a self managed pool, cannot create snaps manually" ) return False # Creating snaps on the pool provided cmd = "uuidgen" out, err = self.rados_obj.node.shell([cmd]) uuid = out[0:5] snap_name = f"{pool_name}-snap-{uuid}" cmd = f"ceph osd pool mksnap {pool_name} {snap_name}" self.rados_obj.node.shell([cmd], long_running=True) # Checking if snap was created successfully if not self.check_snap_exists(snap_name=snap_name, pool_name=pool_name): log.error("Snapshot of pool not created") return False log.debug(f"Created snapshot {snap_name} on pool {pool_name}") return snap_name def check_snap_exists(self, snap_name: str, pool_name: str) -> bool: """ checks the existence of the snapshot name given on the pool Args: snap_name: Name of the snapshot pool_name: Name of the pool Returns: True -> Snapshot exists, False -> snapshot does not exist """ snap_list = self.get_snap_names(pool_name=pool_name) return True if snap_name in snap_list else False def get_snap_names(self, pool_name: str) -> list: """ Fetches the list of snapshots created on the given pool Args: pool_name: name of the pool Returns: list of the snaps created """ cmd = "ceph osd dump" pool_status = self.rados_obj.run_ceph_command(cmd=cmd, timeout=800) for detail in pool_status["pools"]: if detail["pool_name"] == pool_name: snap_list = [snap["name"] for snap in detail["pool_snaps"]] log.debug(f"snapshots on pool : {snap_list}") return snap_list def delete_pool_snap(self, pool_name: str, snap_name: str = None) -> bool: """ deletes snapshots of the given pool. 
If no snap name is provided, deletes all the snapshots on the pool
        Args:
            pool_name: name of the pool
            snap_name: name of the snapshot
        Returns: Pass -> snapshot Deleted, Fail -> snapshot not Deleted
        """
        if snap_name:
            delete_list = [snap_name]
        else:
            delete_list = self.get_snap_names(pool_name=pool_name)
        # Deleting snaps on the pool provided
        for snap in delete_list:
            cmd = f"ceph osd pool rmsnap {pool_name} {snap}"
            self.rados_obj.node.shell([cmd])
            # Checking if the snap was deleted successfully
            if self.check_snap_exists(snap_name=snap, pool_name=pool_name):
                log.error(f"Snapshot {snap} still exists on pool {pool_name}")
                return False
            log.debug(f"deleted snapshot {snap} on pool {pool_name}")
        log.debug("Deleted provided snapshots on the pool")
        return True

    def get_bulk_details(self, pool_name: str) -> bool:
        """
        Checks the status of bulk flag on the pool given
        Args:
            pool_name: Name of the pool
        Returns: True -> pass, False -> fail
        """
        # Checking if the sent pool already exists.
        if pool_name not in self.rados_obj.list_pools():
            log.error(f"Pool {pool_name} does not exist")
            return False
        # Getting the bulk status
        obj = self.rados_obj.get_pool_property(pool=pool_name, props="bulk")
        return obj["bulk"]

    def set_bulk_flag(self, pool_name: str) -> bool:
        """
        Sets the bulk flag to true on existing pools
        Args:
            pool_name: Name of the pool
        Returns: True -> pass, False -> fail
        """
        # Checking if the sent pool already exists. If it does not, creating a new pool
        if pool_name not in self.rados_obj.list_pools():
            log.info(
                f"Pool {pool_name} does not exist, creating new pool with bulk enabled"
            )
            if not self.rados_obj.create_pool(pool_name=pool_name, bulk=True):
                log.error("Failed to create the replicated Pool")
                return False
        # Enabling bulk on the already existing pool
        if not self.rados_obj.set_pool_property(
            pool=pool_name, props="bulk", value="true"
        ):
            log.error(f"Could not set the bulk flag on pool {pool_name}")
            return False
        # Sleeping for 2 seconds after pool create/modify for PG's to be calculated with bulk
        time.sleep(2)
        # Checking if the bulk is enabled or not
        return self.get_bulk_details(pool_name=pool_name)

    def rm_bulk_flag(self, pool_name: str) -> bool:
        """
        Removes the bulk flag on existing pools
        Args:
            pool_name: Name of the pool
        Returns: True -> pass, False -> fail
        """
        # Checking if the sent pool already exists.
        if pool_name not in self.rados_obj.list_pools():
            log.info(f"Pool {pool_name} does not exist")
            return False
        # Disabling bulk on the already existing pool
        if not self.rados_obj.set_pool_property(
            pool=pool_name, props="bulk", value="false"
        ):
            log.error(f"Could not unset the bulk flag on pool {pool_name}")
            return False
        # Sleeping for 2 seconds after pool create/modify for PG's to be calculated with bulk
        time.sleep(2)
        # Checking if the bulk is disabled or not
        return not self.get_bulk_details(pool_name=pool_name)

    def get_target_pg_num_bulk_flag(self, pool_name: str) -> int:
        """
        Fetches the target PG counts for the given pool from the autoscaler status
        Args:
            pool_name: Name of the pool
        Returns: PG Count
        """
        # Checking the autoscaler status, final PG counts, bulk flags
        cmd = "ceph osd pool autoscale-status"
        pool_status = self.rados_obj.run_ceph_command(cmd=cmd)
        for entry in pool_status:
            if entry["pool_name"] == pool_name:
                return int(entry["pg_num_final"])
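
# Hypothetical usage of the PoolFunctions bulk-flag helpers above, wrapped in a
# function so it can sit alongside the tests (the pool name and the expectation
# of a higher PG target with the bulk flag are illustrative assumptions only):
def demo_bulk_flag_usage(cephadm) -> None:
    pool_obj = PoolFunctions(node=cephadm)
    if pool_obj.set_bulk_flag(pool_name="bulk_demo_pool"):
        target_pgs = pool_obj.get_target_pg_num_bulk_flag(pool_name="bulk_demo_pool")
        log.info(f"Autoscaler target PG count with bulk flag set: {target_pgs}")
    # Clearing the flag again; the autoscaler should shrink the target over time
    pool_obj.rm_bulk_flag(pool_name="bulk_demo_pool")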
def run(ceph_cluster, **kw) -> int: """ Test to copy data from one pool to another Returns: 1 -> Fail, 0 -> Pass """ log.info(run.__doc__) config = kw["config"] cephadm = CephAdmin(cluster=ceph_cluster, **config) rados_obj = RadosOrchestrator(node=cephadm) pool_obj = PoolFunctions(node=cephadm) client_node = ceph_cluster.get_nodes(role="client")[0] pool_configs_path = "conf/pacific/rados/test-confs/pool-configurations.yaml" with open(pool_configs_path, "r") as fd: pool_configs = yaml.safe_load(fd) pool_orig = pool_configs[config["pool-1-type"]][config["pool-1-conf"]] pool_target = pool_configs[config["pool-2-type"]][config["pool-2-conf"]] create_given_pool(rados_obj, pool_orig) create_given_pool(rados_obj, pool_target) # Writing objects with huge omap entries if not pool_obj.fill_omap_entries(pool_name=pool_orig["pool_name"], obj_end=500): log.error( f"Omap entries not generated on pool {pool_orig['pool_name']}") return 1 do_rados_put(mon=client_node, pool=pool_orig["pool_name"], nobj=1000) snapshots = [] for _ in range(5): snap = pool_obj.create_pool_snap(pool_name=pool_orig["pool_name"]) if snap: snapshots.append(snap) else: log.error("Could not create snapshot on the pool") return 1 # Using cppool to copy contents b/w the pools cmd = f"rados cppool {pool_orig['pool_name']} {pool_target['pool_name']}" client_node.exec_command(sudo=True, cmd=cmd, long_running=True) # Sleeping for 2 seconds after copy to perform get operations time.sleep(2) do_rados_get(client_node, pool_target["pool_name"], 1) # Checking if the snapshots of pool was also copied # Snapshots of pool should not be copied for snap_name in snapshots: if pool_obj.check_snap_exists(snap_name=snap_name, pool_name=pool_target["pool_name"]): log.error("Snapshot of pool exists") return 1 # deleting the Target pool created after cppool rados_obj.detete_pool(pool=pool_target["pool_name"]) # Creating new target pool to test import/export create_given_pool(rados_obj, pool_target) # Creating temp file to hold pool info client_node.exec_command(cmd="touch /tmp/file", ) # crating export of data on old pool cmd = f"rados export -p {pool_orig['pool_name']} /tmp/file" client_node.exec_command(sudo=True, cmd=cmd, long_running=True) # Importing the file into the new pool cmd = f"rados import -p {pool_target['pool_name']} /tmp/file" client_node.exec_command(sudo=True, cmd=cmd, long_running=True) # Sleeping for 2 seconds after copy to perform get operations time.sleep(2) do_rados_get(client_node, pool_target["pool_name"], 1) # Checking if the snapshots of pool was also copied # Snapshots of pool should not be copied for snap_name in snapshots: if pool_obj.check_snap_exists(snap_name=snap_name, pool_name=pool_target["pool_name"]): log.error("Snapshot of pool exists") return 1 # deleting the Original & Target pool created after cppool rados_obj.detete_pool(pool=pool_target["pool_name"]) rados_obj.detete_pool(pool=pool_orig["pool_name"]) return 0
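
# A follow-up assertion the copy test could make (a sketch; the exact key names
# in the 'ceph df detail' JSON, such as stats["objects"], are assumed here and
# may vary between releases): after 'rados cppool' or 'rados import', the object
# count of the target pool should match the source pool.
def pools_have_equal_objects(rados_obj, src_pool: str, dst_pool: str) -> bool:
    """Compares the object counts of two pools as reported by 'ceph df detail'."""
    pools = rados_obj.run_ceph_command(cmd="ceph df detail")["pools"]
    counts = {entry["name"]: entry["stats"].get("objects", 0) for entry in pools}
    return counts.get(src_pool) == counts.get(dst_pool)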
def run(ceph_cluster, **kw): """ enables connectivity mode and deploys stretch cluster with arbiter mon node Args: ceph_cluster (ceph.ceph.Ceph): ceph cluster """ log.info("Deploying stretch cluster with arbiter mon node") log.info(run.__doc__) config = kw.get("config") cephadm = CephAdmin(cluster=ceph_cluster, **config) rados_obj = RadosOrchestrator(node=cephadm) mon_obj = MonElectionStrategies(rados_obj=rados_obj) client_node = ceph_cluster.get_nodes(role="client")[0] site1_name = config["site1"]["name"] site2_name = config["site2"]["name"] # disabling automatic crush update cmd = "ceph config set osd osd_crush_update_on_start false" cephadm.shell([cmd]) # Sleeping for 2 seconds after map update. time.sleep(2) # Setting the election strategy to connectivity mode if not mon_obj.set_election_strategy(mode="connectivity"): log.error("could not set election strategy to connectivity mode") return 1 # Sleeping for 2 seconds after strategy update. time.sleep(2) # Checking updated election strategy in mon map strategy = mon_obj.get_election_strategy() if strategy != 3: log.error( f"cluster created election strategy other than connectivity, i.e {strategy}" ) return 1 log.info("Enabled connectivity mode on the cluster") # Creating new datacenter crush objects and moving under root/default for name in [site1_name, site2_name]: cmd = f"ceph osd crush add-bucket {name} datacenter" rados_obj.run_ceph_command(cmd) time.sleep(2) move_crush_item(cephadm, crush_obj=name, name="root", value="default") time.sleep(2) # Moving all the OSD and Mon daemons into respective sites sites = ["site1", "site2", "site3"] for site in sites: mon_hosts = [ host_obj.hostname for host_obj in ceph_cluster.get_nodes(role="mon") ] log.info(f"Mon hosts defined: {mon_hosts}") osd_hosts = [ host_obj.hostname for host_obj in ceph_cluster.get_nodes(role="osd") ] log.info(f"OSD hosts defined: {osd_hosts}") # Collecting hosts from each site and setting locations accordingly site_details = config[site] crush_name = site_details["name"] host_nodes = cephadm.cluster.get_nodes() for item in site_details["hosts"]: host = [ node for node in host_nodes if re.search(item, node.hostname) ][0] # Moving the mon daemons into site if host.hostname in mon_hosts: cmd = f"ceph mon set_location {host.hostname} datacenter={crush_name}" cephadm.shell([cmd]) log.info( f"Set location for mon {host.hostname} onto site {crush_name}\n" "sleeping for 5 seconds") time.sleep(5) # Moving the osd daemons into site if host.hostname in osd_hosts: move_crush_item( node=cephadm, crush_obj=host.hostname, name="datacenter", value=crush_name, ) log.info( f"Set location for OSD {host.hostname} onto site {crush_name}\n" "sleeping for 5 seconds") time.sleep(5) log.info("Moved all the hosts into respective sites") stretch_rule_name = config.get("stretch_rule_name", "stretch_rule") if not setup_crush_rule( node=client_node, rule_name=stretch_rule_name, site1=site1_name, site2=site2_name, ): log.error("Failed to Add crush rules in the crush map") return 1 # Sleeping for 5 sec for the strategy to be active time.sleep(5) # Enabling the stretch cluster mode tiebreaker_node = get_node_by_id(cephadm.cluster, config["site3"]["hosts"][0]) log.info(f"tiebreaker node provided: {tiebreaker_node.hostname}") cmd = f"ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter" try: cephadm.shell([cmd]) except Exception as err: log.error( f"Error while enabling stretch rule on the datacenter. 
Command : {cmd}" ) log.error(err) return 1 time.sleep(2) # wait for PG's to settle down with new crush rules after deployment of stretch mode wait_for_clean_pg_sets(rados_obj) # Checking if the pools have been updated with the new crush rules acting_set = rados_obj.get_pg_acting_set() if len(acting_set) != 4: log.error( f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4" ) return 1 log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG") log.info("Stretch rule with arbiter monitor node set up successfully") return 0
def run(ceph_cluster, **kw):
    """
    Test to Verify the pg-autoscale flag.
    Returns: 1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)
    pool_configs_path = "conf/pacific/rados/test-confs/pool-configurations.yaml"

    regex = r"\s*(\d.\d)-rhel-\d"
    build = (re.search(regex, config.get("build", config.get("rhbuild")))).groups()[0]
    if not float(build) > 5.0:
        log.info(
            "Test running on version less than 5.1, skipping verifying autoscaler flags"
        )
        return 0

    # Setting the noautoscale flag
    cmd = "ceph osd pool set noautoscale"
    rados_obj.run_ceph_command(cmd=cmd)

    # sleeping for 5 seconds as the command takes some time to affect the status of pools
    time.sleep(5)

    # Getting the autoscale configurations after setting the flag
    # all the pools should have autoscale set to off
    cmd = "ceph osd pool autoscale-status"
    pool_status = rados_obj.run_ceph_command(cmd=cmd)
    for entry in pool_status:
        if entry["pg_autoscale_mode"] == "on":
            log.error(f"Pg autoscaler not turned off for pool : {entry['pool_name']}")
            return 1

    if not mon_obj.verify_set_config(
        section="global", name="osd_pool_default_pg_autoscale_mode", value="off"
    ):
        log.error(
            "Default autoscale mode not set to off upon setting the noautoscale flag"
        )
        return 1
    if not mon_obj.verify_set_config(
        section="mgr", name="mgr/pg_autoscaler/noautoscale", value="true"
    ):
        log.error("noautoscale flag not set to true upon setting the noautoscale flag")
        return 1

    # Creating a new pool with the flag set; the new pool should come up with the autoscaler turned off
    with open(pool_configs_path, "r") as fd:
        pool_configs = yaml.safe_load(fd)
    pool_conf = pool_configs["replicated"]["sample-pool-2"]
    create_given_pool(rados_obj, pool_conf)

    cmd = "ceph osd pool autoscale-status"
    pool_status = rados_obj.run_ceph_command(cmd=cmd)
    for entry in pool_status:
        if entry["pool_name"] == pool_conf["pool_name"]:
            if entry["pg_autoscale_mode"] == "on":
                log.error(
                    f"Pg autoscaler not turned off for the new pool : {entry['pool_name']} "
                    f"created with flag turned on"
                )
                return 1

    # Turning the autoscale flag back off. All the settings made earlier should be reverted
    cmd = "ceph osd pool unset noautoscale"
    rados_obj.run_ceph_command(cmd=cmd)

    # sleeping for 5 seconds as the command takes some time to affect the status of pools
    time.sleep(5)

    # Re-fetching the autoscale status after removing the flag
    cmd = "ceph osd pool autoscale-status"
    pool_status = rados_obj.run_ceph_command(cmd=cmd)
    for entry in pool_status:
        if entry["pg_autoscale_mode"] == "off":
            log.error(f"Pg autoscaler not turned on for pool : {entry['pool_name']}")
            return 1

    if not mon_obj.verify_set_config(
        section="global", name="osd_pool_default_pg_autoscale_mode", value="on"
    ):
        log.error(
            "Default autoscale mode not set to on upon removing the noautoscale flag"
        )
        return 1
    if not mon_obj.verify_set_config(
        section="mgr", name="mgr/pg_autoscaler/noautoscale", value="false"
    ):
        log.error(
            "noautoscale flag not set to false upon removing the noautoscale flag"
        )
        return 1

    # Deleting the pool created earlier
    if not rados_obj.detete_pool(pool=pool_conf["pool_name"]):
        log.error(f"the pool {pool_conf['pool_name']} could not be deleted")
        return 1
    log.info("Autoscale flag is working as expected.")
    return 0
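
# Alternative single-pool check (a sketch): instead of scanning the whole
# autoscale-status output, the mode of one pool can be read directly. This
# assumes run_ceph_command() returns the JSON form of 'ceph osd pool get' and
# that it carries a "pg_autoscale_mode" key; treat both as assumptions.
def get_pool_autoscale_mode(rados_obj, pool_name: str) -> str:
    """Fetches pg_autoscale_mode ('on'/'off'/'warn') for a single pool."""
    out = rados_obj.run_ceph_command(
        cmd=f"ceph osd pool get {pool_name} pg_autoscale_mode"
    )
    return out["pg_autoscale_mode"]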
def run(ceph_cluster, **kw): """ Automates OSD re-balance test scenarios. 1. Create replicated and/or erasure pool/pools 2. Identify the first osd to be removed 3. Fetch the host by daemon_type=osd and osd id 4. Fetch container id and device path 5. Mark osd out and wait for pgs to be active+clean 6. Remove OSD 7. Zap device and wait for device not present 8. Identify the second osd to be removed 9. Fetch the host by daemon_type=osd and osd id 10. Fetch container id and device path 11. Mark osd out 12. Add first osd and wait for device present and pgs to be active+clean """ try: log.info(run.__doc__) config = kw["config"] cephadm = CephAdmin(cluster=ceph_cluster, **config) rados_obj = RadosOrchestrator(node=cephadm) client_node = ceph_cluster.get_nodes(role="client")[0] log.info("Running osd in progress rebalance tests") pool = create_pools(config, rados_obj, client_node) should_not_be_empty(pool, "Failed to retrieve pool details") write_to_pools(config, rados_obj, client_node) rados_obj.change_recover_threads(config=pool, action="set") acting_pg_set = rados_obj.get_pg_acting_set( pool_name=pool["pool_name"]) log.info(f"Acting set {acting_pg_set}") should_not_be_empty(acting_pg_set, "Failed to retrieve acting pg set") osd_id = acting_pg_set[0] host = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id) should_not_be_empty(host, "Failed to fetch host details") dev_path = get_device_path(host, osd_id) log.debug( f"osd1 device path : {dev_path}, osd_id : {osd_id}, host.hostname : {host.hostname}" ) utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=True) method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id) method_should_succeed(wait_for_clean_pg_sets, rados_obj) utils.osd_remove(ceph_cluster, osd_id) method_should_succeed(wait_for_clean_pg_sets, rados_obj) method_should_succeed(utils.zap_device, ceph_cluster, host.hostname, dev_path) method_should_succeed(wait_for_device, host, osd_id, action="remove") osd_id1 = acting_pg_set[1] host1 = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id1) should_not_be_empty(host1, "Failed to fetch host details") dev_path1 = get_device_path(host1, osd_id1) log.debug( f"osd2 device path : {dev_path1}, osd_id : {osd_id1}, host.hostname : {host1.hostname}" ) method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id1) utils.add_osd(ceph_cluster, host.hostname, dev_path, osd_id) method_should_succeed(wait_for_device, host, osd_id, action="add") method_should_succeed(wait_for_clean_pg_sets, rados_obj) acting_pg_set1 = rados_obj.get_pg_acting_set( pool_name=pool["pool_name"]) if len(acting_pg_set) != len(acting_pg_set1): log.error( f"Acting pg set count before {acting_pg_set} and after {acting_pg_set1} rebalance mismatched" ) return 1 if pool.get("rados_put", False): do_rados_get(client_node, pool["pool_name"], 1) utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=False) rados_obj.change_recover_threads(config=pool, action="rm") if config.get("delete_pools"): for name in config["delete_pools"]: method_should_succeed(rados_obj.detete_pool, name) log.info("deleted all the given pools successfully") return 0 except Exception as e: log.info(e) log.info(traceback.format_exc()) return 1
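
# Supplementary before/after check the re-balance test could add (a sketch;
# assumes the JSON form of 'ceph osd stat' exposes num_osds / num_up_osds /
# num_in_osds at the top level, which may differ on older releases):
def osd_counts(rados_obj) -> dict:
    """Returns total/up/in OSD counts, useful to compare before and after OSD removal."""
    stat = rados_obj.run_ceph_command(cmd="ceph osd stat")
    return {
        "total": stat.get("num_osds"),
        "up": stat.get("num_up_osds"),
        "in": stat.get("num_in_osds"),
    }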
def run(ceph_cluster, **kw): """ enables connectivity mode and deploys stretch cluster with arbiter mon node Actions Performed: 1. Disables the automatic crush map update 2. Collects the OSD daemons in the cluster and split them into 2 sites. 3. If add capacity is selected, only half of the OSD's will be added to various sites initially. 4. Adds the stretch rule into crush map. 5. Adding monitors into the 2 sites. 6. Create a replicated pool and deploy stretch mode. 7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites ) 8. Check for the bump in election epochs throughout. 9. Check the acting set in PG for 4 OSD's. 2 from each site. Verifies bugs: [1]. https://bugzilla.redhat.com/show_bug.cgi?id=1937088 [2]. https://bugzilla.redhat.com/show_bug.cgi?id=1952763 Args: ceph_cluster (ceph.ceph.Ceph): ceph cluster """ log.info("Deploying stretch cluster with arbiter mon node") log.info(run.__doc__) config = kw.get("config") cephadm = CephAdmin(cluster=ceph_cluster, **config) rados_obj = RadosOrchestrator(node=cephadm) mon_obj = MonElectionStrategies(rados_obj=rados_obj) client_node = ceph_cluster.get_nodes(role="client")[0] tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0] if not client_node and not tiebreaker_node: log.error( "Admin client and tie breaker node not configured, Cannot modify crush rules for stretch cluster" ) return 1 mon_state = get_mon_details(node=cephadm) if len(list(mon_state["monitors"])) < 5: log.error( f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}" ) return 1 osd_details = get_osd_details(node=cephadm) if len(osd_details.keys()) < 4: log.error( f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}" ) return 1 # Finding and Deleting any stray EC pools that might have been left on cluster pool_dump = rados_obj.run_ceph_command(cmd="ceph osd dump") for entry in pool_dump["pools"]: if entry["type"] != 1 and entry["crush_rule"] != 0: log.info( f"A non-replicated pool found : {entry['pool_name']}, proceeding to delete pool" ) if not rados_obj.detete_pool(pool=entry["pool_name"]): log.error( f"the pool {entry['pool_name']} could not be deleted") return 1 log.debug("No pools other than replicated found on cluster") # disabling automatic crush update cmd = "ceph config set osd osd_crush_update_on_start false" cephadm.shell([cmd]) site1 = config.get("site1", "site1") site2 = config.get("site2", "site2") # Collecting osd details and split them into Sita A and Site B sorted_osds = sort_osd_sites(all_osd_details=osd_details) site_a_osds = sorted_osds[0] site_b_osds = sorted_osds[1] if config.get("perform_add_capacity"): site_a_osds = sorted_osds[0][:(len(sorted_osds[0]) // 2)] site_b_osds = sorted_osds[1][:(len(sorted_osds[1]) // 2)] if not set_osd_sites( node=cephadm, osds=site_a_osds, site=site1, all_osd_details=osd_details, ): log.error("Failed to move the OSD's into sites") return 1 if not set_osd_sites( node=cephadm, osds=site_b_osds, site=site2, all_osd_details=osd_details, ): log.error("Failed to move the OSD's into sites") return 1 # collecting mon map to be compared after strtech cluster deployment stretch_rule_name = "stretch_rule" if not setup_crush_rule(node=client_node, rule_name=stretch_rule_name, site1=site1, site2=site2): log.error("Failed to Add crush rules in the crush map") return 1 # Setting the election strategy to connectivity mode if not mon_obj.set_election_strategy(mode="connectivity"): log.error("could not set 
election strategy to connectivity mode") return 1 # Sleeping for 5 sec for the strategy to be active time.sleep(5) init_mon_state = get_mon_details(node=cephadm) # Checking if mon elections happened after changing election strategy if mon_state["epoch"] > init_mon_state["epoch"]: log.error( "Election epoch not bumped up after setting the connectivity mode." ) return 1 # Checking updated election strategy in mon map strategy = mon_obj.get_election_strategy() if strategy != 3: log.error( f"cluster created election strategy other than connectivity, i.e {strategy}" ) return 1 log.info("Enabled connectivity mode on the cluster") log.info( f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3") if not set_mon_sites(node=cephadm, tiebreaker_node=tiebreaker_node, site1=site1, site2=site2): log.error("Failed to ad monitors into respective sites") return 1 # All the existing pools should be automatically changed with stretch rule. Creating a test pool pool_name = "test_pool_1" if not rados_obj.create_pool(pool_name=pool_name, pg_num=16): log.error("Failed to create the replicated Pool") return 1 log.info("Monitors added to respective sites. enabling stretch rule") cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter" try: cephadm.shell([cmd]) except Exception as err: log.error( f"Error while enabling stretch rule on the datacenter. Command : {cmd}" ) log.error(err) return 1 if get_mon_details(node=cephadm)["epoch"] < init_mon_state["epoch"]: log.error("Election epoch not bumped up after Enabling strech mode") return 1 if config.get("perform_add_capacity"): pool_name = "test_stretch_pool" if not rados_obj.create_pool( pool_name=pool_name, crush_rule=stretch_rule_name, ): log.error("Failed to create the replicated Pool") return 1 do_rados_put(mon=client_node, pool=pool_name, nobj=1000) # Increasing backfill/rebalance threads so that cluster will re-balance it faster after add capacity rados_obj.change_recover_threads(config=config, action="set") log.info( "Performing add Capacity after the deployment of stretch cluster") site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds] site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds] if not set_osd_sites( node=cephadm, osds=site_a_osds, site=site1, all_osd_details=osd_details, ): log.error("Failed to move the OSD's into sites") return 1 if not set_osd_sites( node=cephadm, osds=site_b_osds, site=site2, all_osd_details=osd_details, ): log.error("Failed to move the OSD's into sites") return 1 # Waiting for up to 2.5 hours for the PG's to enter active + Clean state after add capacity # Automation for bug : [1] & [2] end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000) while end_time > datetime.datetime.now(): flag = True status_report = rados_obj.run_ceph_command(cmd="ceph report") # Proceeding to check if all PG's are in active + clean for entry in status_report["num_pg_by_state"]: rec = ( "remapped", "backfilling", "degraded", "incomplete", "peering", "recovering", "recovery_wait", "peering", "undersized", "backfilling_wait", ) if any(key in rec for key in entry["state"].split("+")): flag = False if flag: log.info( "The recovery and back-filling of the OSD is completed") break log.info( f"Waiting for active + clean. 
Active alerts: {status_report['health']['checks'].keys()},"
                f" PG States : {status_report['num_pg_by_state']},"
                f" checking status again in 2 minutes"
            )
            time.sleep(120)
        rados_obj.change_recover_threads(config=config, action="rm")
        if not flag:
            log.error(
                "The cluster did not reach active + Clean state after add capacity"
            )
            return 1
        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)

    # Checking if the pools have been updated with the new crush rules
    acting_set = rados_obj.get_pg_acting_set(pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
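
# Small helper sketch for the election-epoch assertions used by both stretch
# deployment tests above (assumes get_mon_details() returns a dict carrying the
# monmap "epoch" as an int or numeric string):
def election_epoch_bumped(before: dict, after: dict) -> bool:
    """True when the monmap election epoch increased between the two snapshots."""
    return int(after["epoch"]) > int(before["epoch"])


# e.g. election_epoch_bumped(before=init_mon_state, after=get_mon_details(node=cephadm))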