def change_config_for_slow_ops(node: CephAdmin, action: str, **kwargs):
    """
    Changes a few config values on the ceph cluster to intentionally increase the chances
    of hitting slow_ops on the cluster network.
    Actions performed and rationale:
    * paxos_service_trim_min & paxos_service_trim_max set as mentioned in
      bz : https://bugzilla.redhat.com/show_bug.cgi?id=1943357#c0
    * osd_op_complaint_time -> reducing the time threshold within which an OSD should respond to requests
    * osd_max_backfills & osd_recovery_max_active -> Increasing the number of threads for recovery &
      backfill so as to reduce the n/w bandwidth available for client IO operations
    Args:
        node: Cephadm node where the commands need to be executed
        action: whether to set the config values or to remove them from the cluster
            Values : "set" -> to set the config values
                     "rm" -> to remove the config changes made
        kwargs: Any other optional args that need to be passed
    Raises:
        TestBedSetupFailure: if the values set on the cluster do not match the intended values
    """
    value_map = {
        "paxos_service_trim_min": kwargs.get("paxos_service_trim_min", 10),
        "paxos_service_trim_max": kwargs.get("paxos_service_trim_max", 100),
        "osd_op_complaint_time": kwargs.get("osd_op_complaint_time", 0.000001),
        "osd_max_backfills": kwargs.get("osd_max_backfills", 8),
        "osd_recovery_max_active": kwargs.get("osd_recovery_max_active", 10),
    }
    cmd_map = {
        "paxos_service_trim_min": f"ceph config {action} mon paxos_service_trim_min",
        "paxos_service_trim_max": f"ceph config {action} mon paxos_service_trim_max",
        "osd_op_complaint_time": f"ceph config {action} osd osd_op_complaint_time",
        "osd_max_backfills": f"ceph config {action} osd osd_max_backfills",
        "osd_recovery_max_active": f"ceph config {action} osd osd_recovery_max_active",
    }

    # Removing the config values set when action is to remove
    if action == "rm":
        for cmd in cmd_map.keys():
            node.shell([cmd_map[cmd]])
        return

    # Adding the config values
    for val in cmd_map.keys():
        cmd = f"{cmd_map[val]} {value_map[val]}"
        node.shell([cmd])

    # Verifying the values set in the config
    config_dump = run_ceph_command(node, cmd="ceph config dump")
    for val in cmd_map.keys():
        for conf in config_dump:
            if conf["name"] == val:
                if float(conf["value"]) != float(value_map[val]):
                    error = f"Values do not match for config {conf['name']}"
                    raise TestBedSetupFailure(error)

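# Illustrative usage sketch (not part of the module): how a test might toggle the
# slow-ops friendly settings around a client workload. "cephadm" is assumed to be a
# CephAdmin instance created by the test framework, and the kwarg override shown is a
# hypothetical value.
#
#   change_config_for_slow_ops(node=cephadm, action="set", osd_max_backfills=16)
#   try:
#       run_rados_bench_write(node=cephadm, pool_name="slow_ops_pool")
#   finally:
#       change_config_for_slow_ops(node=cephadm, action="rm")
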
def unmute_health_alert(alert: str, node: CephAdmin) -> bool:
    """
    Un-mutes the health alert on the cluster
    Args:
        alert: Name of the alert to be un-muted
        node: node on which command should be executed
    Returns: True -> pass, False -> failure
    """
    all_alerts = get_alerts(node)
    if alert not in all_alerts["muted_alerts"] + all_alerts["active_alerts"]:
        log.info(f"The alert: {alert} is not generated on the cluster, cannot un-mute")
        return True
    if alert in all_alerts["active_alerts"]:
        log.info(f"The alert: {alert} is already un-muted")
        return True

    # Un-muting the given alert
    cmd = f"ceph health unmute {alert}"
    node.shell([cmd])
    # Sleeping for 2 sec for the alert to be logged
    time.sleep(2)

    all_alerts = get_alerts(node)
    log.info(
        f"Un-muted the alert : {alert}. All the un-muted alerts : {all_alerts['active_alerts']}"
    )
    return True if alert in all_alerts["active_alerts"] else False

def set_osd_sites(
    node: CephAdmin, osds: list, site: str, all_osd_details: dict
) -> bool:
    """
    Collects all the details about the OSDs present on the cluster and distributes them
    among the two sites
    Args:
        node: Cephadm node where the commands need to be executed
        osds: list of OSDs to be added to the given site
        site: the name of the site
        all_osd_details: dictionary of OSDs containing the details
            eg : {'2': {'weight': 0.01459, 'state': 'up', 'name': 'osd.2'},
                  '7': {'weight': 0.01459, 'state': 'up', 'name': 'osd.7'}}
    Returns: True -> pass, False -> fail
    """
    # adding the identified OSDs into the respective sites
    sites = set()
    sites.add(site)
    if len(sites) > 2:
        log.error("There can only be 2 sites with stretch cluster at present")
        return False
    try:
        for osd in osds:
            cmd = f"ceph osd crush move {all_osd_details[osd]['name']} host=host-{site}-{osd} datacenter={site}"
            node.shell([cmd])
            # sleeping for 20 seconds for the osd to be moved
            time.sleep(20)
    except Exception:
        log.error("Failed to move the OSDs into site A and site B")
        return False

    cmd = "ceph osd tree"
    log.info(node.shell([cmd]))
    return True

def get_pg_acting_set(node: CephAdmin, pool_name: str) -> list:
    """
    Fetches the PG details of the given pool and returns the acting set of OSDs
    from a sample PG of the pool
    Args:
        node: Cephadm node where the commands need to be executed
        pool_name: name of the pool whose acting OSD set is needed
    Returns: list of OSDs that are part of the acting set
            eg : [3, 15, 20]
    """
    # Collecting details about the cluster
    cmd = "ceph osd dump --format=json"
    out, err = node.shell([cmd])
    res = json.loads(out)
    for val in res["pools"]:
        if val["pool_name"] == pool_name:
            pool_id = val["pool"]
            break
    # Collecting the details of the 1st PG in the pool <ID>.0
    pg_num = f"{pool_id}.0"
    cmd = f"ceph pg map {pg_num} --format=json"
    out, err = node.shell([cmd])
    res = json.loads(out)
    return res["up"]

def set_mon_sites(node: CephAdmin, tiebreaker_node, site1: str, site2: str) -> bool:
    """
    Adds the mon daemons into the two sites, with the arbiter node at site 3 acting as the tiebreaker
    Args:
        node: Cephadm node where the commands need to be executed
        tiebreaker_node: name of the monitor to be added as tiebreaker (site 3)
        site1: Name of the 1st site
        site2: Name of the 2nd site
    Returns: True -> pass, False -> fail
    """
    # Collecting the mon details
    mon_state = get_mon_details(node=node)
    monitors = list(mon_state["monitors"])
    monitors.remove(tiebreaker_node.hostname)
    commands = [
        f"/bin/ceph mon set_location {tiebreaker_node.hostname} datacenter=arbiter",
        f"/bin/ceph mon set_location {monitors[0]} datacenter={site1}",
        f"/bin/ceph mon set_location {monitors[1]} datacenter={site1}",
        f"/bin/ceph mon set_location {monitors[2]} datacenter={site2}",
        f"/bin/ceph mon set_location {monitors[3]} datacenter={site2}",
    ]
    for cmd in commands:
        try:
            node.shell([cmd])
        except Exception as err:
            log.error(err)
            return False

    # Waiting till the mons restart with the new site info and rejoin the mon quorum
    if not wait_for_alert(node=node, alert="MON_DOWN", duration=180):
        log.error("mon down after adding to site, even after waiting 180 seconds")
        return False
    log.info("Added all the mon nodes into respective sites")
    return True

def detete_pool(node: CephAdmin, pool: str) -> bool:
    """
    Deletes the given pool from the cluster
    Args:
        node: Cephadm node where the commands need to be executed
        pool: name of the pool to be deleted
    Returns: True -> pass, False -> fail
    """
    # Checking if config is set to allow pool deletion
    config_dump = run_ceph_command(node, cmd="ceph config dump")
    if "mon_allow_pool_delete" not in [conf["name"] for conf in config_dump]:
        cmd = "ceph config set mon mon_allow_pool_delete true"
        node.shell([cmd])

    existing_pools = run_ceph_command(node, cmd="ceph df")
    if pool not in [ele["name"] for ele in existing_pools["pools"]]:
        log.error(f"Pool:{pool} does not exist on cluster, cannot delete")
        return True

    cmd = f"ceph osd pool delete {pool} {pool} --yes-i-really-really-mean-it"
    node.shell([cmd])

    existing_pools = run_ceph_command(node, cmd="ceph df")
    if pool not in [ele["name"] for ele in existing_pools["pools"]]:
        log.info(f"Pool:{pool} deleted successfully")
        return True
    log.error(f"Pool:{pool} could not be deleted on cluster")
    return False

def reweight_crush_items(node: CephAdmin, **kwargs) -> bool:
    """
    Performs re-weight of various CRUSH items, based on the key-value pairs sent
    Args:
        node: Cephadm node where the commands need to be executed
        **kwargs: Arguments for the commands
    Returns: True -> pass, False -> fail
    """
    # Note: not returning False here as the result of the re-weight is not verified.
    # (PGs would be redistributed based on whether the weight was increased or decreased.)
    # PG movement is slow and will take time based on the number of objects.
    # The verification check still needs to be implemented; for now the re-weight is
    # assumed to have worked and True is returned.
    if kwargs.get("name"):
        name = kwargs["name"]
        weight = kwargs["weight"]
        cmd = f"ceph osd crush reweight {name} {weight}"
        node.shell([cmd])
        return True

    # if no params are provided, doing the re-balance by utilization
    # todo: implement the checks to verify the behaviour of the re-weight commands
    cmd = r"ceph osd reweight-by-utilization"
    node.shell([cmd])
    return True

def generate_health_alert(alert: str, node: CephAdmin, **kwargs) -> bool:
    """
    Method to generate various health alerts
    Args:
        alert: name of the alert to be generated
        node: name of the installer node
        clear: bool value which specifies if the given alert should be cleared
        kwargs: any other params that need to be sent for a particular alert
    Returns: True -> pass, False -> failure
    """
    clear = kwargs.get("clear")
    if alert == "OSDMAP_FLAGS":
        # kwargs.get() never raises KeyError, so check the value explicitly
        flag = kwargs.get("flag")
        if not flag:
            log.error(f"Flag not provided to generate health alert : {alert}")
            return False
        cmd = f"ceph osd set {flag}"
        if clear:
            cmd = f"ceph osd unset {flag}"
        try:
            node.shell([cmd])
            log.debug(f"{flag} set")
        except Exception:
            log.error(f"Failed to set the osd flag {flag}")
            log.error(traceback.format_exc())
            return False
        # Sleeping for 5 seconds for the error to be logged by the cluster
        time.sleep(5)
        return True

    if alert == "MON_DISK_BIG":
        cmd = "ceph config set global mon_data_size_warn 2500000"
        if clear:
            cmd = "ceph config set global mon_data_size_warn 16106127360"
        try:
            node.shell([cmd])
            log.debug("changed the mon data warn size param")
        except Exception:
            log.error("Failed to change the mon data warn size")
            log.error(traceback.format_exc())
            return False
        # Sleeping for 5 seconds for the error to be logged by the cluster
        time.sleep(5)
        return True

    log.error(f"method not implemented to generate the alert : {alert}")
    return False

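# Illustrative usage sketch (not part of the module): raising and later clearing the
# OSDMAP_FLAGS health alert by setting/unsetting an OSD flag. "cephadm" is assumed to be
# a CephAdmin instance from the test framework; "noout" is just one example flag.
#
#   generate_health_alert(alert="OSDMAP_FLAGS", node=cephadm, flag="noout")
#   ...
#   generate_health_alert(alert="OSDMAP_FLAGS", node=cephadm, flag="noout", clear=True)
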
def set_cluster_configuration_checks(node: CephAdmin, **kwargs) -> bool:
    """
    Sets up cephadm to periodically scan each of the hosts in the cluster, to understand
    the state of the OS, disks, NICs etc
    ref doc : https://docs.ceph.com/en/latest/cephadm/operations/#cluster-configuration-checks
    Args:
        node: Cephadm node where the commands need to be executed
        kwargs: Any other param that needs to be passed
    Returns: True -> pass, False -> fail
    """
    cmd = "ceph config set mgr mgr/cephadm/config_checks_enabled true"
    node.shell([cmd])

    # Checking if the checks are enabled on the cluster
    cmd = "ceph cephadm config-check status"
    out, err = node.shell([cmd])
    if not re.search("Enabled", out):
        log.error("Cluster config checks not enabled")
        return False

    if kwargs.get("disable_check_list"):
        for check in kwargs.get("disable_check_list"):
            cmd = f"ceph cephadm config-check disable {check}"
            node.shell([cmd])

    if kwargs.get("enable_check_list"):
        for check in kwargs.get("enable_check_list"):
            cmd = f"ceph cephadm config-check enable {check}"
            node.shell([cmd])

    cmd = "ceph cephadm config-check ls"
    log.info(node.shell([cmd]))
    return True

def run(ceph_cluster, **kw):
    """
    Cephadm Bootstrap

    Args:
        ceph_cluster (ceph.ceph.Ceph): Ceph cluster object
        kw: test data
            - Bootstraps the cluster with the default or a custom image and returns
              after cephadm.bootstrap. To use the default image, set 'registry'.

    Example:
        config:
            command: bootstrap
            base_cmd_args:
                verbose: true
            args:
                custom_image: true | false
                mon-ip: <node_name>
                mgr-id: <mgr_id>
                fsid: <id>
    """
    config = kw.get("config")
    build = config.get("build", config.get("rhbuild"))
    ceph_cluster.rhcs_version = build
    config["overrides"] = kw.get("test_data", {}).get("custom-config")

    # Manage Ceph using ceph-admin orchestration
    command = config.pop("command")
    service = config.pop("service", "")
    log.info("Executing %s %s" % (service, command))

    instance = CephAdmin(cluster=ceph_cluster, **config)
    if "shell" in command:
        instance.shell(args=config["args"])
        return 0
    try:
        method = fetch_method(instance, command)
        out, err = method(config)

        # Verification of arguments
        # bootstrap response through stdout & stderr are combined here
        # currently console response is coming through stderr.
        args = config.get("args", {})
        verify_bootstrap(instance, args, out + err)
    finally:
        # Get cluster state
        get_cluster_state(instance)
    return 0

def get_slow_ops_data(node: CephAdmin, installer, action) -> bool:
    """
    Checks the operations running on the cluster
    Args:
        node: Cephadm node where the commands need to be executed
        installer: Name of the node where the cephadm shell / Mon daemon are collocated
        action: Specifies whether to check for current operations or historic operations
            Values : "current" -> checks for operations that are ongoing on the cluster
                     "historic" -> operations that are completed and marked done by the Monitor
    Returns: False if there are any ongoing ops on the cluster, else True
    """
    # checking if any ops are currently present
    if action == "current":
        cmd = f"ceph daemon mon.{installer.hostname} ops -f json"
        out, err = node.shell([cmd])
        status = json.loads(out)
        log.info(status)
        if status["num_ops"] >= 1:
            log.error(
                f"There are operations ongoing on the cluster. Number : {status['num_ops']}"
            )
            for op in status["ops"]:
                log.error(
                    f"{op['description']} generated : {(op['type_data']['info'])}"
                )
            return False

    # Checking all the ops reported historically
    elif action == "historic":
        cmd = f"ceph daemon mon.{installer.hostname} dump_historic_ops -f json"
        out, err = node.shell([cmd])
        details = json.loads(out)
        size = details["size"]
        if size < 1:
            log.error("No slow operations generated on the cluster")
            return True
        total_dur = details["duration"]
        ops = details["ops"]
        log.info(
            f"No of slow_ops recorded : {size} for a total duration of {total_dur}\n"
            f"Slow ops generated for below items : \n"
        )
        for op in ops:
            log.info(f"{op['description']} generated : {(op['type_data']['info'])}")
    return True

def run(ceph_cluster, **kw):
    """Ceph-admin module to manage the ceph-dashboard service.

    check ceph.ceph_admin.dashboard for test config.

    Args:
        ceph_cluster (ceph.ceph.Ceph): Ceph cluster object.
        kw: keyword arguments from test data.

    Returns:
        value 0 on success.
    """
    log.info("Running Ceph-admin Dashboard test")
    config = kw.get("config")

    build = config.get("build", config.get("rhbuild"))
    ceph_cluster.rhcs_version = build

    # Manage Ceph using ceph-admin orchestration
    command = config.pop("command")
    log.info("Executing dashboard %s operation" % command)
    instance = CephAdmin(cluster=ceph_cluster, **config)
    try:
        method = fetch_method(dashboard, command)
        method(instance, config.get("args"))
    finally:
        # Get cluster state
        get_cluster_state(instance)
    return 0

def run(ceph_cluster, **kw):
    """
    Prepares the cluster & runs rados Customer Scenarios.
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
        kw: Args that need to be passed to the test for initialization
    Returns: 1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)

    if config.get("mondb_trim_config"):
        db_config = config.get("mondb_trim_config")
        try:
            verify_mon_db_trim(ceph_cluster=ceph_cluster, node=cephadm, **db_config)
            log.info("Mon DB is getting trimmed regularly")
        except (TestCaseFailureException, TestBedSetupFailure):
            log.error("Failed to verify mon db trimming")
            return 1

    log.info("Completed running the customer Scenario(s)")
    return 0

def get_osd_details(node: CephAdmin) -> dict:
    """
    Collects details such as weight and state of all OSDs on the cluster
    Args:
        node: Cephadm node where the commands need to be executed
    Returns: dict of OSD details, empty dict if no OSDs were found
        dict eg : {'2': {'weight': 0.01459, 'state': 'up', 'name': 'osd.2'},
                   '7': {'weight': 0.01459, 'state': 'up', 'name': 'osd.7'}}
    """
    # Collecting all the OSD details
    cmd = "ceph osd tree"
    out, err = node.shell([cmd])
    log.info(out)
    regex = r"(\d{1,})\s+[\w]*\s+([.\d]*)\s+(osd.\d{1,})\s+(\w*)"
    osd_dict = {}
    if re.search(regex, out):
        osds = re.findall(regex, out)
        for osd in osds:
            osd_dict[osd[0]] = {
                "weight": float(osd[1]),
                "state": osd[3],
                "name": osd[2],
            }
    else:
        log.error("No OSDs were found on the system")
    return osd_dict

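# Illustrative sketch (not part of the module): what the regex in get_osd_details()
# captures from a single, hypothetical "ceph osd tree" line. Runnable on its own with
# just the standard library.
#
#   import re
#
#   sample = " 2    hdd  0.01459          osd.2        up   1.00000  1.00000"
#   regex = r"(\d{1,})\s+[\w]*\s+([.\d]*)\s+(osd.\d{1,})\s+(\w*)"
#   print(re.findall(regex, sample))
#   # -> [('2', '0.01459', 'osd.2', 'up')]
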
def configure_pg_autoscaler(node: CephAdmin, **kwargs) -> bool:
    """
    Configures the pg_autoscaler as a global parameter and on pools
    Args:
        node: Cephadm node where the commands need to be executed
        **kwargs: Any other param that needs to be set
    Returns: True -> pass, False -> fail
    """
    if kwargs.get("enable"):
        mgr_modules = run_ceph_command(node, cmd="ceph mgr module ls")
        if "pg_autoscaler" not in mgr_modules["enabled_modules"]:
            cmd = "ceph mgr module enable pg_autoscaler"
            node.shell([cmd])

    if kwargs.get("pool_name"):
        pool_name = kwargs.get("pool_name")
        pg_scale_value = kwargs.get("pg_autoscale_value", "on")
        cmd = f"ceph osd pool set {pool_name} pg_autoscale_mode {pg_scale_value}"
        node.shell([cmd])

    if kwargs.get("default_mode"):
        default_mode = kwargs.get("default_mode")
        cmd = (
            f"ceph config set global osd_pool_default_pg_autoscale_mode {default_mode}"
        )
        node.shell([cmd])

    cmd = "ceph osd pool autoscale-status -f json"
    log.info(node.shell([cmd]))
    return True

def move_crush_item(node: CephAdmin, crush_obj: str, name: str, value: str) -> None:
    """
    Moves the specified CRUSH object to the given location, provided by name/value
    Args:
        node: node where the commands need to be executed
        crush_obj: Name of the CRUSH object to be moved
        name: New CRUSH object type
        value: New CRUSH object location
    Returns: None
    """
    cmd = f"ceph osd crush move {crush_obj} {name}={value}"
    try:
        node.shell([cmd])
        time.sleep(2)
    except Exception as err:
        log.error(err)

def create_erasure_pool(node: CephAdmin, name: str, **kwargs) -> bool:
    """
    Creates an erasure code profile and then creates a pool with the same
    Args:
        node: Cephadm node where the commands need to be executed
        name: Name of the profile to create
        **kwargs: Any other param that needs to be set in the EC profile
    Returns: True -> pass, False -> fail
    """
    failure_domain = kwargs.get("crush-failure-domain", "osd")
    k = kwargs.get("k", 3)
    m = kwargs.get("m", 2)
    plugin = kwargs.get("plugin", "jerasure")
    pool_name = kwargs.get("pool_name")
    profile_name = f"ecprofile_{name}"

    # Creating an erasure coded profile with the options provided
    cmd = (
        f"ceph osd erasure-code-profile set {profile_name}"
        f" crush-failure-domain={failure_domain} k={k} m={m} plugin={plugin}"
    )
    try:
        node.shell([cmd])
    except Exception as err:
        log.error(f"Failed to create ec profile : {profile_name}")
        log.error(err)
        return False

    cmd = f"ceph osd erasure-code-profile get {profile_name}"
    log.info(node.shell([cmd]))
    # Creating the pool with the profile created
    if not create_pool(
        node=node,
        ec_profile_name=profile_name,
        disable_pg_autoscale=True,
        **kwargs,
    ):
        log.error(f"Failed to create Pool {pool_name}")
        return False
    log.info(f"Created the ec profile : {profile_name} and pool : {pool_name}")
    return True

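# Illustrative usage sketch (not part of the module): creating a 3+2 EC profile and a
# matching pool. "cephadm" is assumed to be a CephAdmin instance from the test framework;
# the profile name, pool name and failure domain are hypothetical. Because the
# "crush-failure-domain" key is hyphenated, it cannot be passed as a Python keyword
# argument directly and has to go through dict unpacking.
#
#   ec_conf = {"crush-failure-domain": "host"}
#   create_erasure_pool(
#       node=cephadm, name="test_ec", pool_name="ec_pool_1", k=3, m=2, **ec_conf
#   )
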
def mute_health_alert(
    alert: str, node: CephAdmin, duration: str = None, sticky: bool = False
) -> bool:
    """
    Mutes the health alert generated on the cluster
    Args:
        alert: Name of the alert to be muted
        node: node on which command should be executed
        duration: duration for which the alert should be muted.
            Allowed Values: None -> mutes the specified alert indefinitely, until the same alert is raised again
                            5m, 1h -> mutes the specified alert for the specified duration
        sticky: makes use of the "--sticky" param to mute the specified alert indefinitely
    Returns: True -> pass, False -> failure
    """
    all_alerts = get_alerts(node)
    if alert not in all_alerts["active_alerts"] + all_alerts["muted_alerts"]:
        log.info(f"The alert: {alert} is not generated on the cluster, cannot mute")
        return True
    if alert in all_alerts["muted_alerts"]:
        log.info(f"The alert: {alert} is already muted")
        return True

    # Muting the given alert, along with the specified duration
    cmd = f"ceph health mute {alert}"
    if duration:
        cmd += f" {duration}"
    if sticky:
        cmd += " --sticky"
    node.shell([cmd])
    # Sleeping for 5 sec for the alert to be logged
    time.sleep(5)

    all_alerts = get_alerts(node)
    log.info(
        f"Muted the alert : {alert}. All the muted alerts : {all_alerts['muted_alerts']}"
    )
    return True if alert in all_alerts["muted_alerts"] else False

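# Illustrative usage sketch (not part of the module): muting an active alert for a fixed
# window and un-muting it later. "cephadm" is assumed to be a CephAdmin instance from the
# test framework; the alert name and duration are hypothetical examples.
#
#   mute_health_alert(alert="OSDMAP_FLAGS", node=cephadm, duration="10m")
#   ...
#   unmute_health_alert(alert="OSDMAP_FLAGS", node=cephadm)
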
def run_rados_bench_write(node: CephAdmin, pool_name: str, **kwargs) -> bool:
    """
    Method to trigger write operations via the rados bench tool
    Args:
        node: Cephadm node where the commands need to be executed
        pool_name: pool on which the operation will be performed
        kwargs: Any other param that needs to be passed
    Returns: True -> pass, False -> fail
    """
    duration = kwargs.get("rados_write_duration", 200)
    byte_size = kwargs.get("byte_size", 4096)
    cmd = f"sudo rados --no-log-to-stderr -b {byte_size} -p {pool_name} bench {duration} write --no-cleanup"
    try:
        node.shell([cmd])
        return True
    except Exception as err:
        log.error(f"Error running rados bench write on pool : {pool_name}")
        log.error(err)
        return False

def set_logging_to_file(node: CephAdmin) -> bool:
    """
    Enables cluster logging into files at /var/log/ceph and checks file permissions
    Args:
        node: Cephadm node where the commands need to be executed
    Returns: True -> pass, False -> fail
    """
    try:
        cmd = "ceph config set global log_to_file true"
        node.shell([cmd])
        cmd = "ceph config set global mon_cluster_log_to_file true"
        node.shell([cmd])
    except Exception:
        log.error("Error while enabling config to log into file")
        return False

    # Sleeping for 10 seconds for the files to be generated
    time.sleep(10)

    cmd = "ls -ll /var/log/ceph"
    out, err = node.shell([cmd])
    log.info(out)
    regex = r"\s*([-rwx]*)\.\s+\d\s+([\w]*)\s+([\w]*)\s+[\w\s:]*(ceph[\w.]*log)"
    # expected permissions and owner/group of the cluster log files
    perm = "-rw-------"
    user = "******"
    files = ["ceph.log", "ceph.audit.log"]
    if re.search(regex, out):
        match = re.findall(regex, out)
        for val in match:
            if not (val[0] == perm and val[1] == user and val[2] == user):
                log.error(f"file permissions are not correct for file : {val[3]}")
                return False
            if val[3] in files:
                files.remove(val[3])
    if files:
        log.error(f"Did not find the log files : {files}")
        return False
    return True

def change_recover_threads(node: CephAdmin, config: dict, action: str):
    """
    Increases or decreases the recovery threads based on the action sent
    Args:
        node: Cephadm node where the commands need to be executed
        config: Config from the suite file for the run
        action: whether to set the backfill / recovery threads or remove the changes made
            Values : "set" -> set the threads to the specified value
                     "rm" -> remove the config changes made
    """
    cfg_map = {
        "osd_max_backfills": f"ceph config {action} osd osd_max_backfills",
        "osd_recovery_max_active": f"ceph config {action} osd osd_recovery_max_active",
    }
    for cmd in cfg_map:
        if action == "set":
            command = f"{cfg_map[cmd]} {config.get(cmd, 8)}"
        else:
            command = cfg_map[cmd]
        node.shell([command])

def run_rados_bench_read(node: CephAdmin, pool_name: str, **kwargs) -> bool:
    """
    Method to trigger read operations via the rados bench tool
    Args:
        node: Cephadm node where the commands need to be executed
        pool_name: pool on which the operation will be performed
        kwargs: Any other param that needs to be passed
    Returns: True -> pass, False -> fail
    """
    duration = kwargs.get("rados_read_duration", 80)
    try:
        cmd = f"rados --no-log-to-stderr -p {pool_name} bench {duration} seq"
        node.shell([cmd])
        cmd = f"rados --no-log-to-stderr -p {pool_name} bench {duration} rand"
        node.shell([cmd])
        return True
    except Exception as err:
        log.error(f"Error running rados bench read on pool : {pool_name}")
        log.error(err)
        return False

def create_pool(node: CephAdmin, pool_name: str, pg_num: int = 64, **kwargs) -> bool:
    """
    Create a pool named from the pool_name parameter.
    Args:
        node: Cephadm node where the commands need to be executed
        pool_name: name of the pool being created.
        pg_num: initial number of pgs.
        kwargs: Any other args that need to be passed
    Returns: True -> pass, False -> fail
    """
    log.info(f"creating pool_name {pool_name}")
    cmd = f"ceph osd pool create {pool_name} {pg_num} {pg_num}"
    if kwargs.get("ec_profile_name"):
        cmd = f"{cmd} erasure {kwargs['ec_profile_name']}"
    try:
        node.shell([cmd])
    except Exception as err:
        log.error(f"Error creating pool : {pool_name}")
        log.error(err)
        return False

    # Enabling rados application on the pool
    enable_app_cmd = f"sudo ceph osd pool application enable {pool_name} {kwargs.get('app_name', 'rados')}"
    node.shell([enable_app_cmd])

    cmd_map = {
        "min_size": f"ceph osd pool set {pool_name} min_size {kwargs.get('min_size')}",
        "size": f"ceph osd pool set {pool_name} size {kwargs.get('size')}",
        "erasure_code_use_overwrites": f"ceph osd pool set {pool_name} "
        f"allow_ec_overwrites {kwargs.get('erasure_code_use_overwrites')}",
        "disable_pg_autoscale": f"ceph osd pool set {pool_name} pg_autoscale_mode off",
        "crush_rule": f"sudo ceph osd pool set {pool_name} crush_rule {kwargs.get('crush_rule')}",
        "pool_quota": f"ceph osd pool set-quota {pool_name} {kwargs.get('pool_quota')}",
    }
    for key in kwargs:
        if cmd_map.get(key):
            try:
                node.shell([cmd_map[key]])
            except Exception as err:
                log.error(
                    f"Error setting the property : {key} for pool : {pool_name}"
                )
                log.error(err)
                return False

    log.info(f"Created pool {pool_name} successfully")
    return True

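# Illustrative usage sketch (not part of the module): creating a replicated pool and
# driving the optional property kwargs handled above. "cephadm" is assumed to be a
# CephAdmin instance from the test framework; the pool name, sizes and quota shown here
# are hypothetical values.
#
#   create_pool(
#       node=cephadm,
#       pool_name="re_pool_1",
#       pg_num=32,
#       app_name="rbd",
#       size=3,
#       min_size=2,
#       pool_quota="max_objects 10000",
#   )
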
def run(ceph_cluster, **kw):
    """
    Verifies the config change history in the monitor configuration database
    Returns: 1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    # getting the last config change, to which we will roll back later
    init_config = mon_obj.get_ceph_log(count=1)[0]
    log.info(
        "Config at the beginning of test. \n"
        f"Version: {init_config['version']}"
        f"Changes made: {init_config['changes']}"
    )

    log.info(
        "Setting new changes and verifying if the changes are reflected in the log"
    )
    if not mon_obj.set_config(section="osd", name="osd_max_scrubs", value="8"):
        log.error("Error setting config ")
        return 1

    # Checking the versions and changes made.
    test_config = mon_obj.get_ceph_log(count=1)[0]
    log.info(
        "Config changes made for test. \n"
        f"Version: {test_config['version']}"
        f"Changes made: {test_config['changes']}"
    )

    if not test_config["version"] > init_config["version"]:
        log.error(
            f"The log is not updated with new config changes."
            f"Version: {test_config['version']}"
        )
        return 1

    try:
        name = test_config["changes"][0].get("name")
        value = str(test_config["changes"][0].get("new_value"))
        # both the name and the new value must match the change made above
        if not (name == "osd/osd_max_scrubs" and value == "8"):
            log.error(
                f"The log is not updated with new config changes."
                f"Changes made: {test_config['changes']}"
            )
            return 1
    except Exception:
        log.error("The log collected does not contain the value and changes made")
        return 1

    log.info("The ceph config log is successfully updated after changes ")
    return 0

def operator(test_config, step_config, **kw):
    """
    Using the provided test config file, this method triggers the RBD SDK calls for that
    specific scenario
    Arguments:
        test_config: containing the key/value pairs passed from the test-suite
        step_config: arguments required for a specific operation
        kw: test data
    Returns:
        0 on success or 1 for failures
    """
    if step_config.get("method") == "shell":
        cephadm = CephAdmin(kw["ceph_cluster_dict"], test_config)
        cephadm.shell(args=step_config["args"])
    else:
        # maintain a dictionary to map to classes based on the service
        # instantiate the class
        instance = CLASS_MAP[step_config["class"]](nodes=kw["ceph_nodes"])
        method = getattr(instance, step_config["method"])
        log.info(method)
        method(step_config["args"])
    return 0

def run_ceph_command(node: CephAdmin, cmd: str) -> dict:
    """
    Runs the given ceph command with the json format flag appended and returns the
    parsed output
    Args:
        node: Cephadm node where the commands need to be executed
        cmd: Command that needs to be run
    Returns: dictionary of the parsed output
    """
    cmd = f"{cmd} -f json"
    out, err = node.shell([cmd])
    status = json.loads(out)
    return status

def enable_balancer(node: CephAdmin, **kwargs) -> bool:
    """
    Enables the balancer module with the given mode
    Args:
        node: Cephadm node where the commands need to be executed
        kwargs: Any other args that need to be passed
    Returns: True -> pass, False -> fail
    """
    # The balancer is an always-on module, so there is no need to enable it via the mgr.
    # To verify, run `ceph mgr module ls`, which lists all modules.
    # If it is found to be disabled, it can be enabled with `ceph mgr module enable balancer`.
    mgr_modules = run_ceph_command(node, cmd="ceph mgr module ls")
    if not (
        "balancer" in mgr_modules["always_on_modules"]
        or "balancer" in mgr_modules["enabled_modules"]
    ):
        log.error(
            f"Balancer is not enabled. Enabled modules on cluster are:"
            f"{mgr_modules['always_on_modules']} & "
            f"{mgr_modules['enabled_modules']}"
        )

    # Setting the mode for the balancer. Available modes: none|crush-compat|upmap
    balancer_mode = kwargs.get("balancer_mode", "upmap")
    cmd = f"ceph balancer mode {balancer_mode}"
    node.shell([cmd])
    # Turning on the balancer on the system
    cmd = "ceph balancer on"
    node.shell([cmd])

    # Sleeping for 10 seconds after enabling the balancer, then collecting the evaluation status
    time.sleep(10)
    cmd = "ceph balancer status"
    try:
        op, err = node.shell([cmd])
        log.info(op)
        return True
    except Exception:
        log.error("Exception hit while checking balancer status")
        return False

def get_alerts(node: CephAdmin) -> dict:
    """
    Fetches all the current health alert codes that are generated on the ceph cluster
    Args:
        node: node on which command should be executed
    Returns: dict of the alerts present on the cluster
        alert dictionary :
        { "active_alerts" : ['CEPHADM_REFRESH_FAILED', 'OSDMAP_FLAGS'],
          "muted_alerts" : ['MON_DISK_BIG'] }
    """
    cmd = "ceph health detail"
    all_alerts = {}
    out, err = node.shell([cmd])
    regex = r"(\(MUTED[\w\s,-]*\))?\s*\[\w{3}\]\s([\w_]*):"
    alerts = re.findall(regex, out)
    all_alerts["active_alerts"] = [alert[1] for alert in alerts if not alert[0]]
    all_alerts["muted_alerts"] = [alert[1] for alert in alerts if alert[0]]
    return all_alerts

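# Illustrative sketch (not part of the module): what the regex in get_alerts() captures
# from hypothetical "ceph health detail" lines written to match the expected pattern
# (real output formatting may vary by release). Runnable on its own with the standard
# library.
#
#   import re
#
#   sample = (
#       "[WRN] OSDMAP_FLAGS: noout flag(s) set\n"
#       "(MUTED, STICKY) [WRN] MON_DISK_BIG: mons are using a lot of disk space\n"
#   )
#   regex = r"(\(MUTED[\w\s,-]*\))?\s*\[\w{3}\]\s([\w_]*):"
#   print(re.findall(regex, sample))
#   # -> [('', 'OSDMAP_FLAGS'), ('(MUTED, STICKY)', 'MON_DISK_BIG')]
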
def run(ceph_cluster, **kw):
    """
    Verifies that config changes can be reverted in the monitor configuration database,
    using the versions taken from the logs
    Returns: 1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    init_config = mon_obj.get_ceph_log(count=1)[0]
    if not mon_obj.set_config(section="mon", name="mon_max_log_epochs", value="1000"):
        log.error("Error setting config ")
        return 1

    log.info(
        f"Proceeding with reverting the last config change, selecting version: {init_config['version']}"
    )
    if not mon_obj.ceph_config_reset(version=init_config["version"]):
        log.error(
            f"Could not revert to the selected version : {init_config['version']}"
        )
        return 1

    log.info("Reverted to selected version. Checking if the config value is removed")
    if mon_obj.verify_set_config(
        section="mon", name="mon_max_log_epochs", value="1000"
    ):
        log.error("Config is still set after the reset")
        return 1

    test_config = mon_obj.get_ceph_log(count=1)[0]
    log.info(
        f"Reverted successfully to previous versions. config log : {test_config}"
    )
    log.info("The ceph config log is successfully updated after changes ")
    return 0

def get_mon_details(node: CephAdmin) -> dict:
    """
    Collects mon map details like the election epoch, election strategy, active mons and fsid
    Args:
        node: Cephadm node where the commands need to be executed
    Returns: dict with the mon map details
        dict eg : { 'epoch': 6,
                    'fsid': '00206990-70fb-11eb-a425-f0d4e2ebeb54',
                    'election_strategy': 1,
                    'monitors': ['dell-r640-016.dsal.lab.eng.tlv2.redhat.com', 'dell-r640-019'] }
    """
    cmd = "ceph mon dump"
    mon_details = {}
    out, err = node.shell([cmd])
    log.info(out)
    regex_details = (
        r"\s*epoch\s+(\d{1,})\s+fsid\s+([\w-]*)[\w\W]*election_strategy:\s+(\d{1})"
    )
    regex_mon = r"\d{1}\:\s+[\[\]\w\:\./,]*\s+mon\.([\w\-_\.]*)"
    details = re.search(regex_details, out).groups()
    mon_details["epoch"] = int(details[0])
    mon_details["fsid"] = details[1]
    mon_details["election_strategy"] = int(details[2])
    mon_details["monitors"] = re.findall(regex_mon, out)
    return mon_details

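# Illustrative sketch (not part of the module): what the two regexes in get_mon_details()
# extract from a hypothetical, abbreviated "ceph mon dump" output. Runnable on its own
# with just the standard library; the fsid, addresses and hostnames are made up.
#
#   import re
#
#   sample = (
#       "epoch 6\n"
#       "fsid 00206990-70fb-11eb-a425-f0d4e2ebeb54\n"
#       "election_strategy: 1\n"
#       "0: [v2:10.0.0.1:3300/0,v1:10.0.0.1:6789/0] mon.host-a\n"
#       "1: [v2:10.0.0.2:3300/0,v1:10.0.0.2:6789/0] mon.host-b\n"
#   )
#   regex_details = r"\s*epoch\s+(\d{1,})\s+fsid\s+([\w-]*)[\w\W]*election_strategy:\s+(\d{1})"
#   regex_mon = r"\d{1}\:\s+[\[\]\w\:\./,]*\s+mon\.([\w\-_\.]*)"
#   print(re.search(regex_details, sample).groups())
#   # -> ('6', '00206990-70fb-11eb-a425-f0d4e2ebeb54', '1')
#   print(re.findall(regex_mon, sample))
#   # -> ['host-a', 'host-b']
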