Example #1
 def __init__(self, node: CephAdmin):
     """
     initializes the env to run rados commands
     Args:
         node: CephAdmin object
     """
     self.rados_obj = RadosOrchestrator(node=node)
def change_config_for_slow_ops(rados_obj: RadosOrchestrator, action: str,
                               **kwargs):
    """
    Changes a few config values on the ceph cluster to intentionally increase the chances of hitting slow_ops on
    the cluster network.
    Actions performed and rationale:
    * paxos_service_trim_min & paxos_service_trim_max set as mentioned in
    bz : https://bugzilla.redhat.com/show_bug.cgi?id=1943357#c0
    * osd_op_complaint_time -> reducing the time threshold within which an OSD should respond to requests
    * osd_max_backfills & osd_recovery_max_active -> Increasing the number of threads for recovery &
    backfill so as to reduce the n/w bandwidth available for client IO operations
    Args:
        rados_obj: Rados object for command execution
        action: whether to set the config values or to remove them from the cluster
                Values : "set" -> to set the config values
                         "rm" -> to remove the config changes made
        kwargs: Any other optional args that need to be passed
    Raises: TestBedSetupFailure in case of verification failures
    """
    value_map = {
        "paxos_service_trim_min": kwargs.get("paxos_service_trim_min", 10),
        "paxos_service_trim_max": kwargs.get("paxos_service_trim_max", 100),
        "osd_op_complaint_time": kwargs.get("osd_op_complaint_time", 0.000001),
        "osd_max_backfills": kwargs.get("osd_max_backfills", 8),
        "osd_recovery_max_active": kwargs.get("osd_recovery_max_active", 10),
    }
    cmd_map = {
        "paxos_service_trim_min":
        f"ceph config {action} mon paxos_service_trim_min",
        "paxos_service_trim_max":
        f"ceph config {action} mon paxos_service_trim_max",
        "osd_op_complaint_time":
        f"ceph config {action} osd osd_op_complaint_time",
        "osd_max_backfills":
        f"ceph config {action} osd osd_max_backfills",
        "osd_recovery_max_active":
        f"ceph config {action} osd osd_recovery_max_active",
    }

    # Removing the config values set when action is to remove
    if action == "rm":
        for cmd in cmd_map.keys():
            rados_obj.node.shell([cmd_map[cmd]])
        return

    # Adding the config values
    for val in cmd_map.keys():
        cmd = f"{cmd_map[val]} {value_map[val]}"
        rados_obj.node.shell([cmd])

    # Verifying the values set in the config
    config_dump = rados_obj.run_ceph_command(cmd="ceph config dump")
    for val in cmd_map.keys():
        for conf in config_dump:
            if conf["name"] == val:
                if float(conf["value"]) != float(value_map[val]):
                    error = f"Values do not match for config {conf['name']}"
                    raise TestBedSetupFailure(error)
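A minimal usage sketch for the helper above (hedged: run_slow_ops_workload is a hypothetical placeholder for whatever IO/recovery load the test drives). The "set"/"rm" semantics come straight from the function: "set" applies the overrides from value_map, "rm" ignores the kwargs and removes them again, so the revert belongs in a finally block.

def exercise_slow_ops(rados_obj: RadosOrchestrator):
    # Tighten the slow_ops related settings for the duration of the workload only
    change_config_for_slow_ops(rados_obj=rados_obj, action="set", osd_max_backfills=16)
    try:
        run_slow_ops_workload()  # placeholder for the actual workload
    finally:
        # "rm" removes the overrides regardless of how the workload ended
        change_config_for_slow_ops(rados_obj=rados_obj, action="rm")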
Example #3
def run(ceph_cluster, **kw):
    """
    Verifies the config change history in monitor configuration database changes
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    # getting the last config change, to which we will roll back later
    init_config = mon_obj.get_ceph_log(count=1)[0]
    log.info("Config at the beginning of test. \n"
             f"Version: {init_config['version']}"
             f"Changes made: {init_config['changes']}")

    log.info(
        "Setting new changes and verifying if the changes are reflected in the log"
    )
    if not mon_obj.set_config(section="osd", name="osd_max_scrubs", value="8"):
        log.error("Error setting config ")
        return 1

    # Checking the versions and changes made.
    test_config = mon_obj.get_ceph_log(count=1)[0]
    log.info("Config changes made for test. \n"
             f"Version: {test_config['version']}"
             f"Changes made: {test_config['changes']}")

    if not test_config["version"] > init_config["version"]:
        log.error(f"The log is not updated with new config changes."
                  f"Version: {test_config['version']}")
        return 1
    try:
        name = test_config["changes"][0].get("name")
        value = str(test_config["changes"][0].get("new_value"))
        if not (name == "osd/osd_max_scrubs" and value == "8"):
            log.error(f"The log is not updated with new config changes. "
                      f"Changes made: {test_config['changes']}")
            return 1
    except Exception:
        log.error(
            "The log collected does not contain the value and changes made")
        return 1

    log.info("The ceph config log is successfully updated after changes ")
    return 0
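For orientation, the run() above only depends on a couple of keys in the entries returned by mon_obj.get_ceph_log(); a hypothetical entry, inferred purely from the fields the code reads (the real `ceph config log` output may carry more fields):

config_log_entry = {
    "version": 23,  # monotonically increasing config-change version
    "changes": [{"name": "osd/osd_max_scrubs", "new_value": "8"}],
}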
Example #4
def wait_for_clean_pg_sets(rados_obj: RadosOrchestrator) -> bool:
    """
    Waiting for up to 2.5 hours for the PG's to enter active + Clean state after stretch changes
    Automation for bug : [1] & [2]
    Args:
        rados_obj: RadosOrchestrator object to run commands

    Returns:  True -> pass, False -> fail

    """
    end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
    while end_time > datetime.datetime.now():
        flag = True
        status_report = rados_obj.run_ceph_command(cmd="ceph report")

        # Proceeding to check if all PG's are in active + clean
        for entry in status_report["num_pg_by_state"]:
            rec = (
                "remapped",
                "backfilling",
                "degraded",
                "incomplete",
                "peering",
                "recovering",
                "recovery_wait",
                "undersized",
                "backfilling_wait",
            )
            if any(key in rec for key in entry["state"].split("+")):
                flag = False

        if flag:
            log.info("The recovery and back-filling of the OSD is completed")
            return True
        log.info(
            f"Waiting for active + clean. Active aletrs: {status_report['health']['checks'].keys()},"
            f"PG States : {status_report['num_pg_by_state']}"
            f" checking status again in 2 minutes"
        )
        time.sleep(120)

    log.error("The cluster did not reach active + Clean state")
    return False
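A short sketch of how this helper is typically consumed; the direct call mirrors the checks above, and the method_should_succeed variant is how the later examples invoke it.

def ensure_clean_cluster(rados_obj: RadosOrchestrator) -> int:
    # Fail the test (return 1) if the cluster does not settle within the 2.5 hour window
    if not wait_for_clean_pg_sets(rados_obj):
        log.error("PGs did not reach active + clean in time")
        return 1
    return 0

# Alternatively, let the assertion helper raise on failure, as done in Example #7:
# method_should_succeed(wait_for_clean_pg_sets, rados_obj)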
Example #5
def run(ceph_cluster, **kw):
    """
    Verifies the config change reverts in monitor configuration database changes taken from logs
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    init_config = mon_obj.get_ceph_log(count=1)[0]
    if not mon_obj.set_config(
            section="mon", name="mon_max_log_epochs", value="1000"):
        log.error("Error setting config ")
        return 1
    log.info(
        f"Proceeding with reverting the last config change, selecting version: {init_config['version']}"
    )
    if not mon_obj.ceph_config_reset(version=init_config["version"]):
        log.error(
            f"Could not revert to the selected version : {init_config['version']}"
        )
        return 1

    log.info(
        "Reverted to selected version. Checking if the config value is removed"
    )
    if mon_obj.verify_set_config(section="mon",
                                 name="mon_max_log_epochs",
                                 value="1000"):
        log.error("Config is still set after the reset")
        return 1

    test_config = mon_obj.get_ceph_log(count=1)[0]
    log.info(
        f"reverted successfully to previous versions. config log : {test_config}"
    )

    log.info("The ceph config log is successfully updated after changes ")
    return 0
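As a side note, mon_obj.ceph_config_reset(version=N) presumably wraps the monitor's `ceph config reset` command; doing the same step manually through the shell helper used in these examples would look roughly like this:

cephadm.shell([f"ceph config reset {init_config['version']}"])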
Example #6
def run(ceph_cluster, **kw):
    """
    Prepares the cluster to run rados tests.
    Actions Performed:
    1. Create Replicated and Erasure coded pools and write objects into the pools
    2. Setup email alerts for sending errors/warnings on the cluster.
        Verifies Bugs:
        https://bugzilla.redhat.com/show_bug.cgi?id=1849894
        https://bugzilla.redhat.com/show_bug.cgi?id=1878145
    3. Enable logging into file and check file permissions
        Verifies Bug : https://bugzilla.redhat.com/show_bug.cgi?id=1884469
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
        kw: Args that need to be passed to the test for initialization

    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)
    out, err = cephadm.shell(["uuidgen"])
    uuid = out.split("-")[0]

    if config.get("ec_pool"):
        ec_config = config.get("ec_pool")
        ec_config.setdefault("pool_name", f"ecpool_{uuid}")
        if not rados_obj.create_erasure_pool(name=uuid, **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if ec_config.get("test_overwrites_pool"):
            if not rados_obj.verify_ec_overwrites(**ec_config):
                log.error("Failed to create the EC Pool")
                return 1
        else:
            if not rados_obj.bench_write(**ec_config):
                log.error("Failed to write objects into the EC Pool")
                return 1
            rados_obj.bench_read(**ec_config)
            log.info(
                "Created the EC Pool, Finished writing data into the pool")

        if ec_config.get("delete_pool"):
            if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
                log.error("Failed to delete EC Pool")
                return 1

    if config.get("replicated_pool"):
        rep_config = config.get("replicated_pool")
        rep_config.setdefault("pool_name", f"repool_{uuid}")
        if not rados_obj.create_pool(**rep_config):
            log.error("Failed to create the replicated Pool")
            return 1
        if not rados_obj.bench_write(**rep_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**rep_config)
        log.info(
            "Created the replicated Pool, Finished writing data into the pool")
        if rep_config.get("delete_pool"):
            if not rados_obj.detete_pool(pool=rep_config["pool_name"]):
                log.error("Failed to delete replicated Pool")
                return 1

    if config.get("set_pool_configs"):
        changes = config["set_pool_configs"]
        pool_name = changes["pool_name"]
        configurations = changes["configurations"]
        for conf in configurations.keys():
            if not rados_obj.set_pool_property(
                    pool=pool_name, props=conf, value=configurations[conf]):
                log.error(f"failed to set property {conf} on the cluster")
                return 1
        log.info(f"made the config changes on the pool {pool_name}")

    if config.get("email_alerts"):
        alert_config = config.get("email_alerts")
        if not rados_obj.enable_email_alerts(**alert_config):
            log.error("Error while configuring email alerts")
            return 1
        log.info("email alerts configured")

    if config.get("Verify_config_parameters"):
        test_config = config.get("Verify_config_parameters")
        test_node = ceph_cluster.get_nodes(role="osd")[0]
        for conf in test_config["configurations"]:
            for entry in conf.values():
                if entry.get("location_type") == "host":
                    entry["location_value"] = test_node.hostname
                if not mon_obj.set_config(**entry):
                    log.error(f"Error setting config {conf}")
                    return 1
        log.info("done")
        pool_name = "test_pool_1"
        if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
            log.error("Failed to create the replicated Pool")
            return 1

        rados_obj.bench_write(pool_name=pool_name, rados_write_duration=50)

        # Removing test configurations
        for conf in test_config["configurations"]:
            for entry in conf.values():
                if entry.get("location_type") == "host":
                    entry["location_value"] = test_node.hostname
                if not mon_obj.remove_config(**entry):
                    log.error(f"Error setting config {conf}")
                    return 1
        log.info("finished removing values, passed")

    if config.get("log_to_file"):
        if not rados_obj.enable_file_logging():
            log.error("Error while setting config to enable logging into file")
            return 1
        log.info("Logging to file configured")

    if config.get("cluster_configuration_checks"):
        cls_config = config.get("cluster_configuration_checks")
        if not rados_obj.set_cluster_configuration_checks(**cls_config):
            log.error("Error while setting Cluster config checks")
            return 1
        log.info("Set up cluster configuration checks")

    if config.get("configure_balancer"):
        balancer_config = config.get("configure_balancer")
        if not rados_obj.enable_balancer(**balancer_config):
            log.error("Error while setting up balancer on the Cluster")
            return 1
        log.info("Set up Balancer on the cluster")

    if config.get("configure_pg_autoscaler"):
        autoscaler_config = config.get("configure_pg_autoscaler")
        if not rados_obj.configure_pg_autoscaler(**autoscaler_config):
            log.error("Error while setting up pg_autoscaler on the Cluster")
            return 1
        log.info("Set up pg_autoscaler on the cluster")

    if config.get("enable_compression"):
        compression_conf = config["enable_compression"]
        pool_name = compression_conf["pool_name"]
        for conf in compression_conf["configurations"]:
            for entry in conf.values():
                if not rados_obj.pool_inline_compression(pool_name=pool_name,
                                                         **entry):
                    log.error(
                        f"Error setting compression on pool : {pool_name} for config {conf}"
                    )
                    return 1
                if not rados_obj.bench_write(**compression_conf):
                    log.error("Failed to write objects into Pool")
                    return 1
                rados_obj.bench_read(**compression_conf)
                log.info(
                    "Created the replicated Pool, Finished writing data into the pool"
                )
        log.info("Completed compression tests")

    if config.get("delete_pools"):
        for name in config["delete_pools"]:
            if not rados_obj.detete_pool(name):
                log.error(f"the pool {name} could not be deleted")
                return 1
        log.info("deleted all the given pools successfully")

    log.info("All Pre-requisites completed to run Rados suite")
    return 0
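The run() above is driven entirely by the test's `config` dictionary; a hedged sketch of a minimal config (the top-level keys and the pool_name/pg_num/delete_pool fields are taken from the branches above, the values themselves are illustrative):

sample_config = {
    # creates a replicated pool, runs rados bench against it, then deletes it
    "replicated_pool": {"pool_name": "repool_demo", "pg_num": 16, "delete_pool": True},
    # switches cluster logging to file and checks the file permissions
    "log_to_file": True,
}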
Example #7
def run(ceph_cluster, **kw):
    """
    Automates OSD re-balance test scenarios.
    1. Create replicated and/or erasure pool/pools
    2. Identify the osd to be removed
    3. Fetch the host by daemon_type=osd and osd id
    4. Fetch container id and device path
    5. Mark osd out and wait for pgs to be active+clean
    6. Remove OSD
    7. Zap device and wait for device not present
    8. Add OSD and wait for device present and pgs to be active+clean
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    client_node = ceph_cluster.get_nodes(role="client")[0]

    log.info("Running create pool test case")
    if config.get("create_pools"):
        pools = config.get("create_pools")
        for each_pool in pools:
            cr_pool = each_pool["create_pool"]
            if cr_pool.get("pool_type", "replicated") == "erasure":
                method_should_succeed(rados_obj.create_erasure_pool,
                                      name=cr_pool["pool_name"],
                                      **cr_pool)
            else:
                method_should_succeed(rados_obj.create_pool,
                                      pool_name=cr_pool["pool_name"],
                                      **cr_pool)
            method_should_succeed(rados_obj.bench_write, **cr_pool)
        pool = random.choice(pools)["create_pool"]
    if not pool:
        log.error("Failed to retrieve pool details")
        return 1

    rados_obj.change_recover_threads(config=pool, action="set")
    acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool["pool_name"])
    log.info(f"Acting set {acting_pg_set}")
    if not acting_pg_set:
        log.error("Failed to retrieve acting pg set")
        return 1
    osd_id = acting_pg_set[0]
    host = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id)
    if not host:
        log.error("Failed to fetch host details")
        return 1
    # fetch container id
    out, _ = host.exec_command(sudo=True, cmd="podman ps --format json")
    container_id = [
        item["Names"][0] for item in json.loads(out.read().decode())
        if f"osd.{osd_id}" in item["Command"]
    ][0]
    if not container_id:
        log.error("Failed to retrieve container id")
        return 1
    # fetch device path by osd_id
    vol_out, _ = host.exec_command(
        sudo=True,
        cmd=f"podman exec {container_id} ceph-volume lvm list --format json",
    )
    volume_out = vol_out.read().decode()
    dev_path = [
        v[0]["devices"][0] for k, v in json.loads(volume_out).items()
        if str(k) == str(osd_id)
    ][0]
    if not dev_path:
        log.error("Failed to get device path")
        return 1
    log.debug(
        f"device path  : {dev_path}, osd_id : {osd_id}, host.hostname : {host.hostname}"
    )
    utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=True)
    method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id)
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    utils.osd_remove(ceph_cluster, osd_id)
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    method_should_succeed(utils.zap_device, ceph_cluster, host.hostname,
                          dev_path)
    method_should_succeed(wait_for_device,
                          host,
                          container_id,
                          osd_id,
                          action="remove")
    utils.add_osd(ceph_cluster, host.hostname, dev_path, osd_id)
    method_should_succeed(wait_for_device,
                          host,
                          container_id,
                          osd_id,
                          action="add")
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    do_rados_put(mon=client_node, pool=pool["pool_name"], nobj=1000)
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=False)
    rados_obj.change_recover_threads(config=pool, action="rm")

    if config.get("delete_pools"):
        for name in config["delete_pools"]:
            method_should_succeed(rados_obj.detete_pool, name)
        log.info("deleted all the given pools successfully")

    return 0
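The container lookup above is the densest part of this example; the same logic, factored into a small helper for readability (same podman JSON fields, nothing new assumed):

def find_osd_container_name(host, osd_id):
    """Return the podman container name running the given OSD daemon, or None."""
    out, _ = host.exec_command(sudo=True, cmd="podman ps --format json")
    for item in json.loads(out.read().decode()):
        if f"osd.{osd_id}" in item["Command"]:
            return item["Names"][0]
    return None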
Example #8
def run(ceph_cluster, **kw):
    """
    Test to create a pool, then add, get, delete objects & snapshots.
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    pool_obj = PoolFunctions(node=cephadm)
    client_node = rados_obj.ceph_cluster.get_nodes(role="client")[0]
    pool_target_configs = config["verify_client_pg_access"]["configurations"]
    num_snaps = config["verify_client_pg_access"]["num_snapshots"]
    log.debug(
        "Verifying the effects of rados put, get, snap & delete on pool with single PG"
    )

    # Creating pools and starting the test
    for entry in pool_target_configs.values():
        pool_name = entry["pool_name"]
        log.debug(
            f"Creating {entry['pool_type']} pool on the cluster with name {pool_name}"
        )
        if entry.get("pool_type", "replicated") == "erasure":
            method_should_succeed(rados_obj.create_erasure_pool,
                                  name=pool_name,
                                  **entry)
        else:
            method_should_succeed(
                rados_obj.create_pool,
                **entry,
            )

        # Creating and reading objects
        with parallel() as p:
            p.spawn(do_rados_put, client_node, pool_name, 500)
            p.spawn(do_rados_get, client_node, pool_name, 1)

        # Creating and deleting snapshots on the pool
        snapshots = []
        for _ in range(num_snaps):
            snap = pool_obj.create_pool_snap(pool_name=pool_name)
            if snap:
                snapshots.append(snap)
            else:
                log.error("Could not create snapshot on the pool")
                return 1

        if not pool_obj.delete_pool_snap(pool_name=pool_name):
            log.error("Could not delete the snapshots created")
            return 1

        # Deleting the objects created on the pool
        if not pool_obj.do_rados_delete(pool_name=pool_name):
            log.error("Could not delete the objects present on pool")
            return 1

        rados_obj.detete_pool(pool=pool_name)
        log.info(f"Completed all operations on pool {pool_name}")

    log.info(
        "Completed testing effects of rados put, get, snap & delete on pool with single PG"
    )
    return 0
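For reference, the PoolFunctions snapshot helpers used above presumably drive the plain `rados` pool-snapshot subcommands; performing the same step manually from the client node would look roughly like this (the snapshot name is illustrative):

client_node.exec_command(sudo=True, cmd=f"rados -p {pool_name} mksnap demo_snap")
client_node.exec_command(sudo=True, cmd=f"rados -p {pool_name} lssnap")
client_node.exec_command(sudo=True, cmd=f"rados -p {pool_name} rmsnap demo_snap")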
Example #9
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        log.info(
            "Stopped 'm' number of OSD's from, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfilling_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info(
                    "The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active aletrs: {status_report['health']['checks'].keys()},"
                f"PG States : {status_report['num_pg_by_state']}"
                f" checking status again in 1 minute")
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error(
                "The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(
                f"the pool {ec_config['pool_name']} could not be deleted")
            return 1

        log.info("Successfully tested EC pool recovery with K osd's surviving")
        return 0

    if config.get("Compression_tests"):
        """
        Create 2 replicated pools:
        1. Pool_1 : enable any compression algorithm(def snappy) and compression mode(aggressive/force).
        2. Pool_2 : set compression mode to none
        Writing the same amount of data to both pools, the pool with compression enabled should consume less space
        """
        pool_config = config["Compression_tests"]["pool_config"]
        compression_config = config["Compression_tests"]["compression_config"]
        pool_1 = pool_config["pool-1"]
        pool_2 = pool_config["pool-2"]

        if config["Compression_tests"]["pool_type"] == "replicated":
            if not rados_obj.create_pool(pool_name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            if not rados_obj.create_pool(pool_name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
        elif config["Compression_tests"]["pool_type"] == "erasure":
            pool_config["pool_name"] = pool_1
            if not rados_obj.create_erasure_pool(name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            pool_config["pool_name"] = pool_2
            if not rados_obj.create_erasure_pool(name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
            del pool_config["pool_name"]

        log.debug("Created two pools to test compression")

        # Enabling compression on pool-1
        if not rados_obj.pool_inline_compression(pool_name=pool_1,
                                                 **compression_config):
            log.error(
                f"Error setting compression on pool : {pool_1} for config {compression_config}"
            )
            return 1

        # Writing the same amount of data into two pools
        if not rados_obj.bench_write(pool_name=pool_1, **pool_config):
            log.error(
                "Failed to write objects into Pool-1, with compression enabled"
            )
            return 1

        if not rados_obj.bench_write(pool_name=pool_2, **pool_config):
            log.error(
                "Failed to write objects into Pool-2, without compression enabled"
            )
            return 1
        # Sleeping for 5 seconds for status to be updated.
        time.sleep(5)

        log.debug(
            "Finished writing data into the two pools. Checking pool stats")
        try:
            pool_stats = rados_obj.run_ceph_command(
                cmd="ceph df detail")["pools"]
            pool_1_stats = [
                detail for detail in pool_stats if detail["name"] == pool_1
            ][0]["stats"]
            pool_2_stats = [
                detail for detail in pool_stats if detail["name"] == pool_2
            ][0]["stats"]
        except KeyError:
            log.error(
                "No stats about the pools requested found on the cluster")
            return 1

        log.debug(f"Pool-1 stats: {pool_1_stats}")
        log.debug(f"Pool-2 stats: {pool_2_stats}")
        if pool_1_stats["compress_bytes_used"] < 0:
            log.error("No data stored under pool-1 is compressed")
            return 1

        if pool_1_stats["kb_used"] >= pool_2_stats["kb_used"]:
            log.error("Compression has no effect on the pool size...")
            return 1

        if config["Compression_tests"].get("verify_compression_ratio_set"):
            # added verification for test: CEPH-83571672
            if not rados_obj.check_compression_size(pool_name=pool_1,
                                                    **compression_config):
                log.error("data not compressed in accordance to ratio set")
                return 1

        log.info("Pool size is less when compression is enabled")
        return 0

    if config.get("check_autoscaler_profile"):
        """
        Verifies that the default auto-scaler profile on 5.1 builds is scale-up
        Verifies bugs : 1. https://bugzilla.redhat.com/show_bug.cgi?id=2021738
        """
        build = config.get("build", config.get("rhbuild"))
        autoscale_conf = config.get("check_autoscaler_profile")
        regex = r"5.1-rhel-\d{1}"
        if re.search(regex, build):
            log.info(
                "Test running on 5.1 builds, checking the default autoscaler profile"
            )
            if not mon_obj.verify_set_config(**autoscale_conf):
                log.error(
                    f"The default value for autoscaler profile is not scale-up in buld {build}"
                )
                return 1
            log.info(f"Autoscale profile is scale-up in release : {build}")
        else:
            log.debug(
                f"The profile is already scale-up by default in release : {build}"
            )
        return 0
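A quick illustration of the build-string check in the check_autoscaler_profile branch above (the sample build strings are assumptions about the rhbuild format):

import re

assert re.search(r"5.1-rhel-\d{1}", "5.1-rhel-8")      # 5.1 builds: the profile gets verified
assert not re.search(r"5.1-rhel-\d{1}", "5.0-rhel-8")  # older builds: the branch is skipped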
def verify_mon_db_trim(ceph_cluster, node: CephAdmin, **kwargs):
    """
    The Mon DB size should be reduced by removing the old mappings regularly. To verify this behaviour,
    we create various scenarios where the DB would be updated with new mappings and verify that the DB is getting trimmed.
    Verifies BZ:
    https://bugzilla.redhat.com/show_bug.cgi?id=1905339
    https://bugzilla.redhat.com/show_bug.cgi?id=1829646
    https://bugzilla.redhat.com/show_bug.cgi?id=1972281
    https://bugzilla.redhat.com/show_bug.cgi?id=1943357
    https://bugzilla.redhat.com/show_bug.cgi?id=1766702
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
        node: Cephadm node where the commands need to be executed
        kwargs: Any other KV pairs that need to be passed for testing
    Raises: TestCaseFailureException for failure conditions.
    """

    # Creating rados object to run rados commands
    rados_obj = RadosOrchestrator(node=node)
    mon_nodes = ceph_cluster.get_nodes(role="mon")
    osd_nodes = ceph_cluster.get_nodes(role="osd")
    client_node = ceph_cluster.get_nodes(role="client")[0]
    installer_node = ceph_cluster.get_nodes(role="installer")[0]
    daemon_info = rados_obj.run_ceph_command(cmd="ceph orch ps")
    mon_daemons = [
        entry for entry in daemon_info if entry["daemon_type"] == "mon"
    ]

    # Duration for which we will sleep after the mon DB changes are made and mon would have begun trimming old mappings
    mon_db_trim_wait_dur = 1200

    # List to capture the mon db size throughout the duration of the test to check the variations in DB size
    mon_db_size_list = list()
    mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons))

    # Collecting first and last commits to osdmap
    status = rados_obj.run_ceph_command(cmd="ceph report")
    init_commmits = {
        "osdmap_first_committed": float(status["osdmap_first_committed"]),
        "osdmap_last_committed": float(status["osdmap_last_committed"]),
    }

    # creating scenarios where the mon db would be updated with new info
    change_config_for_slow_ops(rados_obj=rados_obj, action="set", **kwargs)
    mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons))

    # Starting read and write by creating a test pool.
    pool_name = "test_pool_ops"
    if not rados_obj.create_pool(pool_name=pool_name,
                                 crush_rule="stretch_rule"):
        error = "failed to create pool to run IO"
        raise TestCaseFailureException(error)
    cmd = f"rados --no-log-to-stderr -b 1024 -p {pool_name} bench 400 write --no-cleanup &"
    client_node.exec_command(sudo=True, cmd=cmd)

    mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons))

    # deleting a previously created pool to increase OSD operations and map changes
    # Pool created as part of suite set-up workflow.
    rados_obj.detete_pool(pool="delete_pool")

    # Proceeding to reboot 1 OSD from each host to trigger rebalance & Backfill
    cluster_fsid = rados_obj.run_ceph_command(cmd="ceph fsid")["fsid"]
    daemon_info = rados_obj.run_ceph_command(cmd="ceph orch ps")
    osd_daemons = [
        entry for entry in daemon_info if entry["daemon_type"] == "osd"
    ]
    for onode in osd_nodes:
        for osd in osd_daemons:
            if re.search(osd["hostname"], onode.hostname):
                # Not using the container ID's provided in ceph orch ps command.
                # Bug : https://bugzilla.redhat.com/show_bug.cgi?id=1943494
                # cmd = f"podman restart {osd['container_id']}"
                cmd = f"systemctl restart ceph-{cluster_fsid}@osd.{osd['daemon_id']}.service"
                log.info(
                    f"rebooting osd-{osd['daemon_id']} on host {osd['hostname']}. Command {cmd}"
                )
                onode.exec_command(sudo=True, cmd=cmd)
                # Sleeping for 5 seconds for status to be updated
                time.sleep(5)
                break

    # Re-weighting the OSDs based on usage to trigger rebalance
    # todo: Verify re-balancing process on OSD's ( PG movement across cluster)
    # todo: Add re-balancing based on crush item provided
    # BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1766702
    rados_obj.reweight_crush_items()
    """
    Waiting for 2 hours for cluster to get to active + clean state.
    Rationale: during cluster activities like backfill, rebalance, osd map change cause Mon DB to be updated.
    Hence, we can wait till mon DB updates are completed, after which DB size should be reduced,
    by trimming the old mappings once the new mappings are added.
    If cluster healthy state is reached within 2 hours, we exit the loop earlier, without waiting for stipulated time,
    But if cluster is still performing operations for long time,
     we would need at-least some data to make sure DB is not just increasing.
      ( DB is expected to increase when operations are in progress.
       Old mappings are removed when operations/ new updates are completed. )
    """
    end_time = datetime.datetime.now() + datetime.timedelta(seconds=7200)
    while end_time > datetime.datetime.now():
        status_report = rados_obj.run_ceph_command(cmd="ceph report")
        ceph_health_status = status_report["health"]
        recovery_tuple = ("OSD_DOWN", "PG_AVAILABILITY", "PG_DEGRADED",
                          "SLOW_OPS")
        daemon_info = rados_obj.run_ceph_command(cmd="ceph orch ps")
        mon_daemons = [
            entry for entry in daemon_info if entry["daemon_type"] == "mon"
        ]
        mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons))

        # Checking for any health warnings that increase db size
        flag = (True if not any(key in ceph_health_status["checks"].keys()
                                for key in recovery_tuple) else False)

        # Proceeding to check if all PG's are in active + clean
        if flag:
            for entry in status_report["num_pg_by_state"]:
                rec = ("remapped", "backfilling", "degraded")
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

        if flag:
            log.info(
                f"The recovery and back-filling of the OSD is completed"
                f"Sleeping for {mon_db_trim_wait_dur} Seconds after clean for mon to remove old mappings"
            )
            time.sleep(mon_db_trim_wait_dur / 2)
            mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons))
            time.sleep(mon_db_trim_wait_dur)
            break
        log.info(
            f"Waiting for active + clean. Active aletrs: {ceph_health_status['checks'].keys()},"
            f" checking status again in 1 minute")
        time.sleep(60)

    # Checking if any slow operations are reported and waiting for 'dur' for the slow_ops to be cleared
    end_time = datetime.datetime.now() + datetime.timedelta(
        seconds=mon_db_trim_wait_dur)
    while end_time > datetime.datetime.now():
        if not get_slow_ops_data(
                node=node, installer=installer_node, action="current"):
            log.info("Operations in progress, checking again in 30 seconds")
            time.sleep(30)
            continue
        # Logging all the historic operations for reference / future enhancement
        get_slow_ops_data(node=node,
                          installer=installer_node,
                          action="historic")
        break

    # collecting the final size of the Mon DB and the OSD map epoch times
    daemon_info = rados_obj.run_ceph_command(cmd="ceph orch ps")
    mon_daemons = [
        entry for entry in daemon_info if entry["daemon_type"] == "mon"
    ]
    final_db_size = get_mondb_size(mon_nodes[0], mon_daemons)
    final_status = rados_obj.run_ceph_command(cmd="ceph report")
    final_commmits = {
        "osdmap_first_committed":
        float(final_status["osdmap_first_committed"]),
        "osdmap_last_committed": float(final_status["osdmap_last_committed"]),
    }

    mon_db_max_size = max(mon_db_size_list)
    # Getting the trend of mon DB size when the operations were running on cluster.
    mon_db_size_size_change = list()
    for i in range(len(mon_db_size_list) - 1):
        mon_db_size_size_change.append(mon_db_size_list[i + 1] -
                                       mon_db_size_list[i])

    # Reverting the config changes made for generating slow_ops
    change_config_for_slow_ops(rados_obj=rados_obj, action="rm", **kwargs)

    # Checking the final results
    if not any(size_change <= 0 for size_change in mon_db_size_size_change):
        error = f"The mon DB is only increasing since the test begun. DB sizes {mon_db_size_list}"
        raise TestCaseFailureException(error)

    if not final_db_size <= mon_db_max_size:
        error = (
            f"The mon DB size after cluster clean is higher than when operations were being performed.\n"
            f"max size during operations : {mon_db_max_size} , final DB size after clean {final_db_size}"
        )
        log.error(error)
        raise TestCaseFailureException()

    # Initial update of OSD maps can be the same at the beginning and end of test
    if (final_commmits["osdmap_first_committed"] <
            init_commmits["osdmap_first_committed"]):
        error = (
            f"The OSD map has not been updated of first commits\n"
            f"The commits are initial commits : {init_commmits}, final commits : {final_commmits}"
        )
        log.error(error)
        raise TestCaseFailureException()

    # Final updates need to be more than initial updates as there are OSD map changes during the duration of test
    if (final_commmits["osdmap_last_committed"] <=
            init_commmits["osdmap_last_committed"]):
        error = (
            f"The OSD map has not been updated of last commits\n"
            f"The commits are initial commits : {init_commmits}, final commits : {final_commmits}"
        )
        log.error(error)
        raise TestCaseFailureException()

    # The number of OSD mappings present on cluster should not exceed 800 in total
    # https://tracker.ceph.com/issues/37875#note-1
    if (final_commmits["osdmap_last_committed"] -
            final_commmits["osdmap_first_committed"]) > 800:
        error = (
            f"There are still too many old commits in Mon DB. OSD map not trimmed as per needed\n"
            f"The commits are initial commits : {init_commmits}, final commits : {final_commmits}"
        )
        log.error(error)
        raise TestCaseFailureException()

    # Checking the paxos trimming sizes
    if (int(final_status["paxos"]["last_committed"]) -
            int(final_status["paxos"]["first_committed"])) > 1000:
        error = (
            f"There are still too many old commits in Mon DB.\n"
            f"The commits are initial commits : {final_status['paxos']['first_committed']},"
            f" final commits : {final_status['paxos']['last_committed']}")
        log.error(error)
        raise TestCaseFailureException()

    log.info("mon DB was trimmed successfully")
Example #11
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)
    pool_obj = PoolFunctions(node=cephadm)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        log.info(
            "Stopped 'm' number of OSD's from, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfilling_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info(
                    "The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active aletrs: {status_report['health']['checks'].keys()},"
                f"PG States : {status_report['num_pg_by_state']}"
                f" checking status again in 1 minute")
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error(
                "The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(
                f"the pool {ec_config['pool_name']} could not be deleted")
            return 1

        log.info("Successfully tested EC pool recovery with K osd's surviving")
        return 0

    if config.get("Compression_tests"):
        """
        Create 2 replicated pools:
        1. Pool_1 : enable any compression algorithm(def snappy) and compression mode(aggressive/force).
        2. Pool_2 : set compression mode to none
        Writing the same amount of data to both pools, the pool with compression enabled should consume less space
        """
        pool_config = config["Compression_tests"]["pool_config"]
        compression_config = config["Compression_tests"]["compression_config"]
        pool_1 = pool_config["pool-1"]
        pool_2 = pool_config["pool-2"]

        if config["Compression_tests"]["pool_type"] == "replicated":
            if not rados_obj.create_pool(pool_name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            if not rados_obj.create_pool(pool_name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
        elif config["Compression_tests"]["pool_type"] == "erasure":
            pool_config["pool_name"] = pool_1
            if not rados_obj.create_erasure_pool(name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            pool_config["pool_name"] = pool_2
            if not rados_obj.create_erasure_pool(name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
            del pool_config["pool_name"]

        log.debug("Created two pools to test compression")

        # Enabling compression on pool-1
        if not rados_obj.pool_inline_compression(pool_name=pool_1,
                                                 **compression_config):
            log.error(
                f"Error setting compression on pool : {pool_1} for config {compression_config}"
            )
            return 1

        # Writing the same amount of data into two pools
        if not rados_obj.bench_write(pool_name=pool_1, **pool_config):
            log.error(
                "Failed to write objects into Pool-1, with compression enabled"
            )
            return 1

        if not rados_obj.bench_write(pool_name=pool_2, **pool_config):
            log.error(
                "Failed to write objects into Pool-2, without compression enabled"
            )
            return 1
        # Sleeping for 5 seconds for status to be updated.
        time.sleep(5)

        log.debug(
            "Finished writing data into the two pools. Checking pool stats")
        try:
            pool_stats = rados_obj.run_ceph_command(
                cmd="ceph df detail")["pools"]
            pool_1_stats = [
                detail for detail in pool_stats if detail["name"] == pool_1
            ][0]["stats"]
            pool_2_stats = [
                detail for detail in pool_stats if detail["name"] == pool_2
            ][0]["stats"]
        except KeyError:
            log.error(
                "No stats about the pools requested found on the cluster")
            return 1

        log.debug(f"Pool-1 stats: {pool_1_stats}")
        log.debug(f"Pool-2 stats: {pool_2_stats}")
        if pool_1_stats["compress_bytes_used"] < 0:
            log.error("No data stored under pool-1 is compressed")
            return 1

        if pool_1_stats["kb_used"] >= pool_2_stats["kb_used"]:
            log.error("Compression has no effect on the pool size...")
            return 1

        if config["Compression_tests"].get("verify_compression_ratio_set"):
            # added verification for test: CEPH-83571672
            if not rados_obj.check_compression_size(pool_name=pool_1,
                                                    **compression_config):
                log.error("data not compressed in accordance to ratio set")
                return 1

        log.info("Pool size is less when compression is enabled")
        return 0

    if config.get("test_autoscaler_bulk_feature"):
        """
        Tests to verify the autoscaler bulk flag, which allows pools to make use of
        scale-down profile, making those pools start with full complements of PG sets.
        Tests include
        1. creating new pools with bulk,
        2. enabling/disabling bulk flag on existing pools
        3. Verify the PG changes when the flag is set/unset
        Verifies bugs : https://bugzilla.redhat.com/show_bug.cgi?id=2049851
        """
        regex = r"\s*(\d.\d)-rhel-\d"
        build = (re.search(regex,
                           config.get("build",
                                      config.get("rhbuild")))).groups()[0]
        if not float(build) > 5.0:
            log.info(
                "Test running on version less than 5.1, skipping verifying bulk flags"
            )
            return 0

        # Creating a pool with bulk feature
        pool_name = config.get("pool_name")
        if not pool_obj.set_bulk_flag(pool_name=pool_name):
            log.error("Failed to create a pool with bulk features")
            return 1

        # Checking the autoscaler status, final PG counts, bulk flags
        pg_target_init = pool_obj.get_target_pg_num_bulk_flag(
            pool_name=pool_name)

        # Unsetting the bulk flag and checking the change in the PG counts
        if not pool_obj.rm_bulk_flag(pool_name=pool_name):
            log.error("Failed to create a pool with bulk features")
            return 1

        # Sleeping for 5 seconds for the new PG num to be set
        time.sleep(5)
        pg_target_interim = pool_obj.get_target_pg_num_bulk_flag(
            pool_name=pool_name)

        # The target PG count once the flag is disabled must be less than when it was enabled
        if pg_target_interim >= pg_target_init:
            log.error("PG's not reduced after bulk flag disabled")
            return 1

        # Setting the bulk flag on pool again and checking the change in the PG counts
        if not pool_obj.set_bulk_flag(pool_name=pool_name):
            log.error("Failed to disable/remove bulk features on pool")
            return 1

        # Sleeping for 5 seconds for the new PG num to be set
        time.sleep(5)

        pg_target_final = pool_obj.get_target_pg_num_bulk_flag(
            pool_name=pool_name)

        # The target PG count after the flag is re-enabled must be greater than when it was disabled
        if pg_target_interim >= pg_target_final:
            log.error("PG's not Increased after bulk flag Enabled")
            return 1

        if config.get("delete_pool"):
            rados_obj.detete_pool(pool=pool_name)
        log.info("Verified the workings of bulk flag")
        return 0

    if config.get("verify_pool_target_ratio"):
        log.debug("Verifying target size ratio on pools")
        target_configs = config["verify_pool_target_ratio"]["configurations"]
        # Creating pools and starting the test
        for entry in target_configs.values():
            log.debug(f"Creating {entry['pool_type']} pool on the cluster")
            if entry.get("pool_type", "replicated") == "erasure":
                method_should_succeed(rados_obj.create_erasure_pool,
                                      name=entry["pool_name"],
                                      **entry)
            else:
                method_should_succeed(
                    rados_obj.create_pool,
                    **entry,
                )
            rados_obj.bench_write(**entry)
            if not pool_obj.verify_target_ratio_set(
                    pool_name=entry["pool_name"],
                    ratio=entry["target_size_ratio"]):
                log.error(
                    f"Could not change the target ratio on the pool: {entry['pool_name']}"
                )
                return 1
            log.debug("Set the ratio. getting the projected pg's")

            rados_obj.change_recover_threads(config=config, action="set")
            log.debug(
                "Waiting for the rebalancing to complete on the cluster after the change"
            )
            # Sleeping for 2 minutes for rebalancing to start & for new PG count to be updated.
            time.sleep(120)

            new_pg_count = int(
                pool_obj.get_pg_autoscaler_value(pool_name=entry["pool_name"],
                                                 item="pg_num_target"))
            if new_pg_count <= entry["pg_num"]:
                log.error(
                    f"Count of PG's not increased on the pool: {entry['pool_name']}"
                    f"Initial creation count : {entry['pg_num']}"
                    f"New count after setting num target : {new_pg_count}")
                return 1

            res = wait_for_clean_pg_sets(rados_obj)
            if not res:
                log.error(
                    "PG's in cluster are not active + Clean after the ratio change"
                )
                return 1
            if not pool_obj.verify_target_ratio_set(
                    pool_name=entry["pool_name"], ratio=0.0):
                log.error(
                    f"Could not remove the target ratio on the pool: {entry['pool_name']}"
                )
                return 1

            # Sleeping for 2 minutes for rebalancing to start & for new PG count to be updated.
            time.sleep(120)
            # Checking if after the removal of ratio, the PG count has reduced
            end_pg_count = int(
                pool_obj.get_pg_autoscaler_value(pool_name=entry["pool_name"],
                                                 item="pg_num_target"))
            if end_pg_count >= new_pg_count:
                log.error(
                    f"Count of PG's not changed/ reverted on the pool: {entry['pool_name']}"
                    f" after removing the target ratios")
                return 1
            rados_obj.change_recover_threads(config=config, action="rm")
            if entry.get("delete_pool", False):
                rados_obj.detete_pool(pool=entry["pool_name"])
            log.info(
                f"Completed the test of target ratio on pool: {entry['pool_name']} "
            )
        log.info("Target ratio tests completed")
        return 0

    if config.get("verify_mon_target_pg_per_osd"):
        pg_conf = config.get("verify_mon_target_pg_per_osd")
        if not mon_obj.set_config(**pg_conf):
            log.error("Could not set the value for mon_target_pg_per_osd ")
            return 1
        mon_obj.remove_config(**pg_conf)
        log.info("Set and verified the value for mon_target_pg_per_osd ")
        return 0

    if config.get("verify_pg_num_min"):
        log.debug("Verifying pg_num_min on pools")
        target_configs = config["verify_pg_num_min"]["configurations"]
        # Creating pools and starting the test
        for entry in target_configs.values():
            log.debug(f"Creating {entry['pool_type']} pool on the cluster")
            if entry.get("pool_type", "replicated") == "erasure":
                method_should_succeed(rados_obj.create_erasure_pool,
                                      name=entry["pool_name"],
                                      **entry)
            else:
                method_should_succeed(
                    rados_obj.create_pool,
                    **entry,
                )
            rados_obj.bench_write(**entry)

            if not rados_obj.set_pool_property(pool=entry["pool_name"],
                                               props="pg_num_min",
                                               value=entry["pg_num_min"]):
                log.error("Could not set the pg_min_size on the pool")
                return 1

            if entry.get("delete_pool", False):
                rados_obj.detete_pool(pool=entry["pool_name"])
            log.info(
                f"Completed the test of pg_num_min on pool: {entry['pool_name']}"
            )
        log.info("pg_num_min tests completed")
        return 0
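
A hedged sketch (not part of the original test) of how the pg_num_min verification above could be tightened: it cross-checks the configured pg_num_min against the autoscaler's projected PG count, reusing the RadosOrchestrator and PoolFunctions helpers shown in these examples. The helper name verify_pg_num_floor is illustrative.

def verify_pg_num_floor(rados_obj, pool_obj, pool_name: str) -> bool:
    # Hedged sketch: rados_obj, pool_obj and log come from the example above.
    # get_pool_property is assumed to return a dict keyed by the property name,
    # mirroring how get_bulk_details() reads the "bulk" property.
    prop = rados_obj.get_pool_property(pool=pool_name, props="pg_num_min")
    pg_num_min = int(prop["pg_num_min"])

    # pg_num_target is one of the items exposed by "ceph osd pool autoscale-status"
    pg_num_target = int(
        pool_obj.get_pg_autoscaler_value(pool_name=pool_name, item="pg_num_target"))

    if pg_num_target < pg_num_min:
        log.error(f"Autoscaler target {pg_num_target} is below pg_num_min "
                  f"{pg_num_min} on pool {pool_name}")
        return False
    log.info(f"pg_num_target {pg_num_target} respects pg_num_min {pg_num_min}")
    return True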
Beispiel #12
0
def run(ceph_cluster, **kw):
    """
    Changes between various election strategies and observes mon quorum behaviour
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    cephadm_node_mon = ceph_cluster.get_nodes(role="installer")[0]

    # Collecting the number of mons in the quorum before the test
    mon_init_count = len(mon_obj.get_mon_quorum().keys())

    # By default, the election strategy is classic. Verifying that
    strategy = mon_obj.get_election_strategy()
    if strategy != 1:
        log.error(
            f"Cluster was deployed with an election strategy other than classic, i.e. {strategy}"
        )
        return 1

    # Changing strategy to 2. i.e disallowed mode.
    if not mon_obj.set_election_strategy(mode="disallow"):
        log.error("could not set election strategy to disallow mode")
        return 1

    # sleeping for 2 seconds for new elections to be triggered and new leader to be elected
    time.sleep(2)

    log.info("Set election strategy to disallow mode. adding disallowed mons")
    # Checking if new leader will be chosen if leader is added to disallowed list
    old_leader = mon_obj.get_mon_quorum_leader()
    if not mon_obj.set_disallow_mon(mon=old_leader):
        log.error(f"could not add mon: {old_leader} to the disallowed list")
        return 1

    # sleeping for 2 seconds for new elections to be triggered and new leader to be elected
    time.sleep(2)

    current_leader = mon_obj.get_mon_quorum_leader()
    if re.search(current_leader, old_leader):
        log.error(
            f"The mon: {old_leader} added to disallow list is still leader")
        return 1

    # removing the mon from the disallowed list
    if not mon_obj.remove_disallow_mon(mon=old_leader):
        log.error(f"could not remove mon: {old_leader} from disallowed list")
        return 1

    # sleeping for 2 seconds for new elections to be triggered and new leader to be elected
    time.sleep(2)

    # Changing strategy to 3. i.e Connectivity mode.
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Checking connectivity scores of all the mons
    cmd = f"ceph daemon mon.{cephadm_node_mon.hostname} connection scores dump"
    rados_obj.run_ceph_command(cmd=cmd)

    # Changing strategy to default
    if not mon_obj.set_election_strategy(mode="classic"):
        log.error("could not set election strategy to classic mode")
        return 1

    # sleeping for 5 seconds for new elections to be triggered and new leader to be elected
    time.sleep(5)

    # Collecting the number of mons in the quorum after the test
    # todo: add other tests to ascertain the health of mon daemons in quorum
    mon_final_count = len(mon_obj.get_mon_quorum().keys())
    if mon_final_count < mon_init_count:
        log.error(
            "There are fewer mons in the quorum at the end of the test than at the beginning")
        return 1

    log.info("Completed all mon election test cases")
    return 0
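
A hedged sketch of the raw monitor commands that the MonElectionStrategies helper above presumably wraps; the "ceph mon set election_strategy" command is standard Ceph, while the election_strategy field read from "ceph mon dump" is an assumption about its JSON output.

def cycle_election_strategies(rados_obj) -> None:
    # Hedged sketch: rados_obj, log and time come from the example above.
    for mode in ("connectivity", "classic"):
        rados_obj.node.shell([f"ceph mon set election_strategy {mode}"])
        # allow a new election round to complete before reading the mon map
        time.sleep(2)
        mon_dump = rados_obj.run_ceph_command(cmd="ceph mon dump")
        log.info(f"Requested mode: {mode}, strategy reported in mon map: "
                 f"{mon_dump.get('election_strategy')}")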
Beispiel #13
0
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Actions Performed:
    1. Disables the automatic crush map update
    2. Collects the OSD daemons in the cluster and split them into 2 sites.
    3. If add capacity is selected, only half of the OSD's will be added to various sites initially.
    4. Adds the stretch rule into crush map.
    5. Adding monitors into the 2 sites.
    6. Create a replicated pool and deploy stretch mode.
    7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites )
    8. Check for the bump in election epochs throughout.
    9. Check the acting set in PG for 4 OSD's. 2 from each site.
    Verifies bugs:
    [1]. https://bugzilla.redhat.com/show_bug.cgi?id=1937088
    [2]. https://bugzilla.redhat.com/show_bug.cgi?id=1952763
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    client_node = ceph_cluster.get_nodes(role="client")[0]
    tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0]

    if not client_node and not tiebreaker_node:
        log.error(
            "Admin client and tie breaker node not configured, Cannot modify crush rules for stretch cluster"
        )
        return 1
    mon_state = get_mon_details(node=cephadm)
    if len(list(mon_state["monitors"])) < 5:
        log.error(
            f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}"
        )
        return 1
    osd_details = get_osd_details(node=cephadm)
    if len(osd_details.keys()) < 4:
        log.error(
            f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}"
        )
        return 1

    if config.get("verify_forced_recovery"):
        log.info("Verifying forced recovery and healthy in stretch environment")

        pool_name = "stretch_pool_recovery"
        if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
            log.error("Failed to create the replicated Pool")
            return 1

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool_name)

        log.info(
            f"Killing 2 OSD's from acting set : {acting_pg_set} to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(2)]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        log.info("Stopped 2 OSD's from acting set, starting to wait for recovery")
        rados_obj.change_recover_threads(config=config, action="set")

        if not rados_obj.bench_write(pool_name=pool_name, **config):
            log.error("Failed to write objects into the Pool")
            return 1

        log.debug("Triggering forced recovery in stretch mode")
        cmd = "ceph osd force_recovery_stretch_mode --yes-i-really-mean-it"
        rados_obj.run_ceph_command(cmd)
        log.info("Triggered the recovery in stretch mode")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # there was data written into pool when the OSD's were down.
        # Verifying if data is recovered and placed into the OSD's after bringing them back
        res = wait_for_clean_pg_sets(rados_obj)
        if not res:
            log.error("PG's in cluster are not active + Clean ")
            return 1

        log.debug("Forcing the stretch cluster into healthy mode")
        cmd = "ceph osd force_healthy_stretch_mode --yes-i-really-mean-it"
        rados_obj.run_ceph_command(cmd)

        log.info("Cluster has successfully recovered and is in healthy state")
        return 0

    # Finding and Deleting any stray EC pools that might have been left on cluster
    pool_dump = rados_obj.run_ceph_command(cmd="ceph osd dump")
    for entry in pool_dump["pools"]:
        if entry["type"] != 1 and entry["crush_rule"] != 0:
            log.info(
                f"A non-replicated pool found : {entry['pool_name']}, proceeding to delete pool"
            )
            if not rados_obj.detete_pool(pool=entry["pool_name"]):
                log.error(f"the pool {entry['pool_name']} could not be deleted")
                return 1
        log.debug("No pools other than replicated found on cluster")

    # disabling automatic crush update
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    site1 = config.get("site1", "site1")
    site2 = config.get("site2", "site2")

    # Collecting OSD details and splitting them into Site A and Site B
    sorted_osds = sort_osd_sites(all_osd_details=osd_details)
    site_a_osds = sorted_osds[0]
    site_b_osds = sorted_osds[1]
    if config.get("perform_add_capacity"):
        site_a_osds = sorted_osds[0][: (len(sorted_osds[0]) // 2)]
        site_b_osds = sorted_osds[1][: (len(sorted_osds[1]) // 2)]

    if not set_osd_sites(
        node=cephadm,
        osds=site_a_osds,
        site=site1,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    if not set_osd_sites(
        node=cephadm,
        osds=site_b_osds,
        site=site2,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    # Collecting mon map to be compared after stretch cluster deployment
    stretch_rule_name = "stretch_rule"
    if not setup_crush_rule(
        node=client_node, rule_name=stretch_rule_name, site1=site1, site2=site2
    ):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Setting the election strategy to connectivity mode
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)
    init_mon_state = get_mon_details(node=cephadm)

    # Checking if mon elections happened after changing election strategy
    if init_mon_state["epoch"] <= mon_state["epoch"]:
        log.error("Election epoch not bumped up after setting the connectivity mode.")
        return 1

    # Checking updated election strategy in mon map
    strategy = mon_obj.get_election_strategy()
    if strategy != 3:
        log.error(
            f"Election strategy is not set to connectivity mode, i.e. {strategy}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    log.info(f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3")
    if not set_mon_sites(
        node=cephadm, tiebreaker_node=tiebreaker_node, site1=site1, site2=site2
    ):
        log.error("Failed to ad monitors into respective sites")
        return 1

    # All the existing pools should be automatically changed with stretch rule. Creating a test pool
    pool_name = "test_pool_1"
    if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
        log.error("Failed to create the replicated Pool")
        return 1

    log.info("Monitors added to respective sites. enabling stretch rule")
    cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1

    if get_mon_details(node=cephadm)["epoch"] <= init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after enabling stretch mode")
        return 1

    # Increasing backfill/rebalance threads so that cluster will re-balance it faster
    rados_obj.change_recover_threads(config=config, action="set")

    # wait for active + clean after deployment of stretch mode
    # checking the state after deployment because of BZ : https://bugzilla.redhat.com/show_bug.cgi?id=2025800
    res = wait_for_clean_pg_sets(rados_obj)
    if not res:
        status_report = rados_obj.run_ceph_command(cmd="ceph report")
        # Proceeding to check if all PG's are in active + clean
        for entry in status_report["num_pg_by_state"]:
            rec = ("remapped", "peering")
            if any(key in rec for key in entry["state"].split("+")):
                log.error(
                    "PG's in cluster are stuck in remapped+peering after stretch deployment."
                )
                return 1

    if config.get("perform_add_capacity"):
        pool_name = "test_stretch_pool"
        if not rados_obj.create_pool(
            pool_name=pool_name,
            crush_rule=stretch_rule_name,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
        do_rados_put(mon=client_node, pool=pool_name, nobj=100)

        log.info("Performing add Capacity after the deployment of stretch cluster")
        site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds]
        site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds]

        if not set_osd_sites(
            node=cephadm,
            osds=site_a_osds,
            site=site1,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1
        if not set_osd_sites(
            node=cephadm,
            osds=site_b_osds,
            site=site2,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1

        flag = wait_for_clean_pg_sets(rados_obj)
        if not flag:
            log.error(
                "The cluster did not reach active + Clean state after add capacity"
            )
            return 1

        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)
        log.info("Successfully completed Add Capacity scenario")

    rados_obj.change_recover_threads(config=config, action="rm")

    # Checking if the pools have been updated with the new crush rules
    acting_set = rados_obj.get_pg_acting_set(pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
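
A hedged sketch of a follow-up check for the stretch deployment above: it walks the crush hierarchy from "ceph osd tree" to count how many OSDs of a PG's acting set sit in each datacenter (2 per site is expected). The nodes/children/type field names are assumptions about the JSON layout.

def osds_per_datacenter(rados_obj, acting_set: list) -> dict:
    # Hedged sketch: rados_obj and log come from the example above.
    tree = rados_obj.run_ceph_command(cmd="ceph osd tree")
    nodes = {node["id"]: node for node in tree["nodes"]}
    counts = {}
    for node in tree["nodes"]:
        if node.get("type") != "datacenter":
            continue
        # Collect every OSD id reachable under this datacenter bucket
        stack, osd_ids = list(node.get("children", [])), set()
        while stack:
            child = nodes.get(stack.pop())
            if child is None:
                continue
            if child.get("type") == "osd":
                osd_ids.add(child["id"])
            else:
                stack.extend(child.get("children", []))
        counts[node["name"]] = len(osd_ids.intersection(acting_set))
        log.debug(f"{counts[node['name']]} acting-set OSDs under {node['name']}")
    return counts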
Beispiel #14
0
def run(ceph_cluster, **kw):
    """
    Test to create a large number of omap entries on the single PG pool and test osd resiliency
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    pool_obj = PoolFunctions(node=cephadm)

    pool_target_configs = config["verify_osd_omap_entries"]["configurations"]
    omap_target_configs = config["verify_osd_omap_entries"]["omap_config"]

    # Creating pools and starting the test
    for entry in pool_target_configs.values():
        log.debug(
            f"Creating {entry['pool_type']} pool on the cluster with name {entry['pool_name']}"
        )
        if entry.get("pool_type", "replicated") == "erasure":
            method_should_succeed(rados_obj.create_erasure_pool,
                                  name=entry["pool_name"],
                                  **entry)
        else:
            method_should_succeed(
                rados_obj.create_pool,
                **entry,
            )

        log.debug(
            "Created the pool. beginning to create large number of omap entries on the pool"
        )
        if not pool_obj.fill_omap_entries(pool_name=entry["pool_name"],
                                          **omap_target_configs):
            log.error(
                f"Omap entries not generated on pool {entry['pool_name']}")
            return 1

        # Fetching the current acting set for the pool
        acting_set = rados_obj.get_pg_acting_set(pool_name=entry["pool_name"])
        rados_obj.change_recover_threads(config={}, action="set")
        log.debug(
            f"Proceeding to restart OSD's from the acting set {acting_set}")
        for osd_id in acting_set:
            rados_obj.change_osd_state(action="stop", target=osd_id)
            # sleeping for 5 seconds for re-balancing to begin
            time.sleep(5)

            # Waiting for cluster to get clean state after OSD stopped
            if not wait_for_clean_pg_sets(rados_obj):
                log.error("PG's in cluster are not active + Clean state.. ")
                return 1
            rados_obj.change_osd_state(action="restart", target=osd_id)
            log.debug(
                f"Cluster reached clean state after osd {osd_id} stop and restart"
            )

        rados_obj.change_recover_threads(config={}, action="rm")
        # deleting the pool created after the test
        rados_obj.detete_pool(pool=entry["pool_name"])

        log.info(
            f"All the OSD's from the acting set {acting_set} were restarted "
            f"and object movement completed for pool {entry['pool_name']}")

    log.info(
        "Completed testing effects of large number of omap entries on pools ")
    return 0
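
A hedged helper that reads back the omap statistics the test above relies on, using the same "ceph df detail" fields (stored_omap, omap_bytes_used) that fill_omap_entries() parses.

def get_pool_omap_stats(rados_obj, pool_name: str) -> dict:
    # Hedged sketch: rados_obj and log come from the example above.
    pool_stats = rados_obj.run_ceph_command(cmd="ceph df detail")["pools"]
    for detail in pool_stats:
        if detail["name"] == pool_name:
            stats = detail["stats"]
            return {
                "stored_omap": stats.get("stored_omap", 0),
                "omap_bytes_used": stats.get("omap_bytes_used", 0),
            }
    log.error(f"Pool {pool_name} not found in 'ceph df detail' output")
    return {}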
Beispiel #15
0
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        log.info("Stopped 'm' number of OSD's from, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfilling_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info("The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active alerts: {status_report['health']['checks'].keys()},"
                f" PG States : {status_report['num_pg_by_state']},"
                f" checking status again in 1 minute"
            )
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error("The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(f"the pool {ec_config['pool_name']} could not be deleted")
            return 1

        log.info("Successfully tested EC pool recovery with K osd's surviving")
        return 0
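
The inline 2.5-hour polling loop above could be factored into a reusable helper; a hedged sketch, reusing the same "ceph report" fields (num_pg_by_state / state) the example already parses.

def wait_for_recovery(rados_obj, timeout: int = 9000, interval: int = 60) -> bool:
    # Hedged sketch: rados_obj, log, time and datetime come from the example above.
    blocked_states = (
        "backfilling", "degraded", "incomplete", "recovering",
        "recovery_wait", "backfilling_wait", "peered", "undersized",
    )
    end_time = datetime.datetime.now() + datetime.timedelta(seconds=timeout)
    while end_time > datetime.datetime.now():
        report = rados_obj.run_ceph_command(cmd="ceph report")
        pending = [
            entry["state"]
            for entry in report["num_pg_by_state"]
            if any(key in blocked_states for key in entry["state"].split("+"))
        ]
        if not pending:
            log.info("All PGs are active+clean, recovery complete")
            return True
        log.info(f"PGs still recovering: {pending}, rechecking in {interval}s")
        time.sleep(interval)
    log.error("Recovery did not complete within the timeout")
    return False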
Beispiel #16
0
class PoolFunctions:
    """
    Contains various functions that help in altering the behaviour, working of pools and verify the changes
    """
    def __init__(self, node: CephAdmin):
        """
        initializes the env to run rados commands
        Args:
            node: CephAdmin object
        """
        self.rados_obj = RadosOrchestrator(node=node)

    def verify_target_ratio_set(self, pool_name, ratio):
        """
        Sets the "target_size_ratio" on the given pool and verifies it from the auto-scale status
        Args:
            pool_name: name of the pool
            ratio: ratio to be set

        Returns: True -> pass, False -> fail

        """
        log.debug(f"Setting ratio: {ratio} on pool: {pool_name}")
        self.rados_obj.set_pool_property(pool=pool_name,
                                         props="target_size_ratio",
                                         value=ratio)

        # sleeping for 2 seconds for the pg autoscaler to update the status and new PG's
        time.sleep(2)
        ratio_set = self.get_pg_autoscaler_value(pool_name,
                                                 item="target_ratio")
        if not ratio_set == ratio:
            log.error("specified target ratio not set on the pool")
            return False
        return True

    def get_pg_autoscaler_value(self, pool_name, item):
        """
        Fetches the requested autoscaler value for the given pool
        Args:
            pool_name: name of the pool
            item: Value of the item to be fetched.
                Allowed values: actual_capacity_ratio|actual_raw_used|bias|capacity_ratio|crush_root_id|target_bytes|
                effective_target_ratio|logical_used|pg_autoscale_mode|pg_num_target|pool_id|raw_used|target_ratio|

        Returns: Requested value
        """
        cmd = "ceph osd pool autoscale-status"
        autoscale_status = self.rados_obj.run_ceph_command(cmd=cmd)
        try:
            pool_details = [
                details for details in autoscale_status
                if details["pool_name"] == pool_name
            ][0]
        except IndexError:
            log.error(f"Pool {pool_name} not found in the autoscale status")
            return None
        return pool_details[item]

    def fill_omap_entries(self, pool_name, **kwargs):
        """
        Creates key-value entries for objects on ceph pools and increases the omap entries on the pool
        e.g. if obj_start and obj_end are 0 and 3000, with num_keys_obj of 1000, the method would create 3000 objects
        with 1k KW pairs each, i.e. a total of 3000*1000 KW entries
        Args:
            pool_name: name of the pool where the KW pairs needed to be added to objects
            **kwargs: other args that can be passed
                Valid args:
                1. obj_start: start count for object creation
                2. obj_end : end count for object creation
                3. num_keys_obj: Number of KW pairs to be added to each object

        Returns: True -> pass, False -> fail
        """
        # Getting the client node to perform the operations
        client_node = self.rados_obj.ceph_cluster.get_nodes(role="client")[0]
        obj_start = kwargs.get("obj_start", 0)
        obj_end = kwargs.get("obj_end", 2000)
        num_keys_obj = kwargs.get("num_keys_obj", 20000)
        log.debug(f"Writing {(obj_end - obj_start) * num_keys_obj} Key paris"
                  f" to increase the omap entries on pool {pool_name}")
        script_loc = "https://raw.githubusercontent.com/red-hat-storage/cephci/master/utility/generate_omap_entries.py"
        client_node.exec_command(
            sudo=True,
            cmd=f"curl -k {script_loc} -O",
        )
        # Setup Script pre-requisites : docopt
        client_node.exec_command(sudo=True,
                                 cmd="pip3 install docopt",
                                 long_running=True)

        cmd_options = f"--pool {pool_name} --start {obj_start} --end {obj_end} --key-count {num_keys_obj}"
        cmd = f"python3 generate_omap_entries.py {cmd_options}"
        client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

        # removing the py file copied
        client_node.exec_command(sudo=True,
                                 cmd="rm -rf generate_omap_entries.py")

        log.debug("Checking the amount of omap entries created on the pool")
        pool_stats = self.rados_obj.run_ceph_command(
            cmd="ceph df detail")["pools"]
        for detail in pool_stats:
            if detail["name"] == pool_name:
                pool_1_stats = detail["stats"]
                total_omap_data = pool_1_stats["omap_bytes_used"]
                omap_data = pool_1_stats["stored_omap"]
                break
        if not omap_data:
            log.error("No omap entries written into pool")
            return False
        log.info(f"Wrote {omap_data} bytes of omap data on the pool."
                 f"Total stored omap data on pool : {total_omap_data}")
        return True

    def do_rados_delete(self, pool_name: str, pg_id: str = None):
        """
        deletes all the objects from the given pool / PG ID
        Args:
            1. pool_name: name of the pool
            2. [ pg_id ]: Pg ID (Optional, but when provided, should be passed along with pool name )

        Returns: True -> pass, False -> fail
        """
        obj_cmd = f"rados -p {pool_name} ls"
        if pg_id:
            obj_cmd = f"rados --pgid {pg_id} ls"

        delete_obj_list = self.rados_obj.run_ceph_command(cmd=obj_cmd,
                                                          timeout=1000)
        for obj in delete_obj_list:
            cmd = f"rados -p {pool_name} rm {obj['name']}"
            self.rados_obj.node.shell([cmd], long_running=True)

            # Sleeping for 3 seconds for object reference to be deleted
            time.sleep(3)

            # Checking if object is still present in the pool
            out = self.rados_obj.run_ceph_command(cmd=obj_cmd, timeout=1000)
            rem_objs = [obj["name"] for obj in out]
            if obj["name"] in rem_objs:
                log.error(f"Object {obj['name']} not deleted in the pool")
                return False
            log.debug(f"deleted object: {obj['name']} from pool {pool_name}")
        log.info(f"Completed deleting all objects from pool {pool_name}")
        return True

    def create_pool_snap(self, pool_name: str):
        """
        Creates snapshots of the given pool
        Args:
            pool_name: name of the pool
        Returns: Pass -> name of the snapshot created, Fail -> False

        """
        # Checking if snapshots can be created on the supplied pool
        cmd = "ceph osd dump"
        pool_status = self.rados_obj.run_ceph_command(cmd=cmd, timeout=800)
        for detail in pool_status["pools"]:
            if detail["pool_name"] != pool_name:
                continue
            if "selfmanaged_snaps" in detail["flags_names"]:
                # bz: https://bugzilla.redhat.com/show_bug.cgi?id=1425803#c2
                log.error(
                    f"Pool {pool_name} is a self managed pool, cannot create snaps manually"
                )
                return False

        # Creating snaps on the pool provided
        cmd = "uuidgen"
        out, err = self.rados_obj.node.shell([cmd])
        uuid = out[0:5]
        snap_name = f"{pool_name}-snap-{uuid}"
        cmd = f"ceph osd pool mksnap {pool_name} {snap_name}"
        self.rados_obj.node.shell([cmd], long_running=True)

        # Checking if snap was created successfully
        if not self.check_snap_exists(snap_name=snap_name,
                                      pool_name=pool_name):
            log.error("Snapshot of pool not created")
            return False
        log.debug(f"Created snapshot {snap_name} on pool {pool_name}")
        return snap_name

    def check_snap_exists(self, snap_name: str, pool_name: str) -> bool:
        """
        checks the existence of the snapshot name given on the pool
        Args:
            snap_name: Name of the snapshot
            pool_name: Name of the pool

        Returns: True -> Snapshot exists, False -> snapshot does not exist
        """
        snap_list = self.get_snap_names(pool_name=pool_name)
        return snap_name in snap_list

    def get_snap_names(self, pool_name: str) -> list:
        """
        Fetches the list of snapshots created on the given pool
        Args:
            pool_name: name of the pool

        Returns: list of the snaps created
        """
        cmd = "ceph osd dump"
        pool_status = self.rados_obj.run_ceph_command(cmd=cmd, timeout=800)
        for detail in pool_status["pools"]:
            if detail["pool_name"] == pool_name:
                snap_list = [snap["name"] for snap in detail["pool_snaps"]]
                log.debug(f"snapshots on pool : {snap_list}")
        return snap_list

    def delete_pool_snap(self, pool_name: str, snap_name: str = None) -> bool:
        """
        deletes snapshots of the given pool. If no snap name is provided, deletes all the snapshots on the pool
        Args:
            pool_name: name of the pool
            snap_name: name of the snapshot
        Returns: Pass -> snapshot Deleted, Fail -> snapshot not Deleted

        """
        if snap_name:
            delete_list = [snap_name]
        else:
            delete_list = self.get_snap_names(pool_name=pool_name)

        # Deleting snaps on the pool provided
        for snap in delete_list:
            cmd = f"ceph osd pool rmsnap {pool_name} {snap}"
            self.rados_obj.node.shell([cmd])

            # Checking if snap was deleted successfully
            if self.check_snap_exists(snap_name=snap,
                                      pool_name=pool_name):
                log.error(f"Snapshot {snap} still exists on pool {pool_name}")
                return False
            log.debug(f"deleted snapshot {snap} on pool {pool_name}")
        log.debug("Deleted provided snapshots on the pool")
        return True

    def get_bulk_details(self, pool_name: str) -> bool:
        """
        Checks the status of bulk flag on the pool given
        Args:
            pool_name: Name of the pool
        Returns: True -> pass, False -> fail

        """
        # Checking if the sent pool already exists.
        if pool_name not in self.rados_obj.list_pools():
            log.error(f"Pool {pool_name} does not exist")
            return False

        # Getting the bulk status
        obj = self.rados_obj.get_pool_property(pool=pool_name, props="bulk")
        return obj["bulk"]

    def set_bulk_flag(self, pool_name: str) -> bool:
        """
        Sets the bulk flag to true on existing pools
        Args:
            pool_name: Name of the pool
        Returns: True -> pass, False -> fail

        """
        # Checking if the sent pool already exists. If does not, creating new pool
        if pool_name not in self.rados_obj.list_pools():
            log.info(
                f"Pool {pool_name} does not exist, creating new pool with bulk enabled"
            )
            if not self.rados_obj.create_pool(pool_name=pool_name, bulk=True):
                log.error("Failed to create the replicated Pool")
                return False

        # Enabling bulk on already existing pool
        if not self.rados_obj.set_pool_property(
                pool=pool_name, props="bulk", value="true"):
            log.error(f"Could not set the bulk flag on pool {pool_name}")
            return False

        # Sleeping for 2 seconds after pool create/Modify for PG's to be calculated with bulk
        time.sleep(2)

        # Checking if the bulk is enabled or not
        return self.get_bulk_details(pool_name=pool_name)

    def rm_bulk_flag(self, pool_name: str) -> bool:
        """
        Removes the bulk flag on existing pools
        Args:
            pool_name: Name of the pool
        Returns: True -> pass, False -> fail

        """
        # Checking if the sent pool already exists.
        if pool_name not in self.rados_obj.list_pools():
            log.info(f"Pool {pool_name} does not exist")
            return False

        # Disabling bulk on the already existing pool
        if not self.rados_obj.set_pool_property(
                pool=pool_name, props="bulk", value="false"):
            log.error(f"Could not unset the bulk flag on pool {pool_name}")
            return False

        # Sleeping for 2 seconds after pool create/Modify for PG's to be calculated with bulk
        time.sleep(2)

        # Checking if the bulk is enabled or not
        return not self.get_bulk_details(pool_name=pool_name)

    def get_target_pg_num_bulk_flag(self, pool_name: str) -> int:
        """
        Fetches the target PG counts for the given pool from the autoscaler status
        Args:
            pool_name: Name of the pool

        Returns: PG Count

        """
        # Checking the autoscaler status, final PG counts, bulk flags
        cmd = "ceph osd pool autoscale-status"
        pool_status = self.rados_obj.run_ceph_command(cmd=cmd)

        for entry in pool_status:
            if entry["pool_name"] == pool_name:
                return int(entry["pg_num_final"])
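
A hedged usage sketch for the PoolFunctions helpers defined above; the pool name and the CephAdmin node object are placeholders supplied by the caller.

def exercise_pool_functions(node, pool_name: str = "demo_pool") -> bool:
    # Hedged sketch: node is a CephAdmin object and log comes from this module.
    pool_obj = PoolFunctions(node=node)

    # Toggle the bulk flag and observe how the autoscaler's target PG count reacts
    if not pool_obj.set_bulk_flag(pool_name=pool_name):
        return False
    target_with_bulk = pool_obj.get_target_pg_num_bulk_flag(pool_name=pool_name)

    if not pool_obj.rm_bulk_flag(pool_name=pool_name):
        return False
    target_without_bulk = pool_obj.get_target_pg_num_bulk_flag(pool_name=pool_name)
    log.info(f"target PGs with bulk: {target_with_bulk}, "
             f"without bulk: {target_without_bulk}")

    # Snapshot lifecycle: create a snapshot, then delete the one that was created
    snap = pool_obj.create_pool_snap(pool_name=pool_name)
    if not snap:
        return False
    return pool_obj.delete_pool_snap(pool_name=pool_name, snap_name=snap)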
def run(ceph_cluster, **kw) -> int:
    """
    Test to copy data from one pool to another
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    pool_obj = PoolFunctions(node=cephadm)
    client_node = ceph_cluster.get_nodes(role="client")[0]

    pool_configs_path = "conf/pacific/rados/test-confs/pool-configurations.yaml"

    with open(pool_configs_path, "r") as fd:
        pool_configs = yaml.safe_load(fd)

    pool_orig = pool_configs[config["pool-1-type"]][config["pool-1-conf"]]
    pool_target = pool_configs[config["pool-2-type"]][config["pool-2-conf"]]
    create_given_pool(rados_obj, pool_orig)
    create_given_pool(rados_obj, pool_target)

    # Writing objects with huge omap entries
    if not pool_obj.fill_omap_entries(pool_name=pool_orig["pool_name"],
                                      obj_end=500):
        log.error(
            f"Omap entries not generated on pool {pool_orig['pool_name']}")
        return 1

    do_rados_put(mon=client_node, pool=pool_orig["pool_name"], nobj=1000)

    snapshots = []
    for _ in range(5):
        snap = pool_obj.create_pool_snap(pool_name=pool_orig["pool_name"])
        if snap:
            snapshots.append(snap)
        else:
            log.error("Could not create snapshot on the pool")
            return 1

    # Using cppool to copy contents b/w the pools
    cmd = f"rados cppool {pool_orig['pool_name']} {pool_target['pool_name']}"
    client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

    # Sleeping for 2 seconds after copy to perform get operations
    time.sleep(2)

    do_rados_get(client_node, pool_target["pool_name"], 1)

    # Checking if the snapshots of pool was also copied
    # Snapshots of pool should not be copied
    for snap_name in snapshots:
        if pool_obj.check_snap_exists(snap_name=snap_name,
                                      pool_name=pool_target["pool_name"]):
            log.error("Snapshot of pool exists")
            return 1

    # deleting the Target pool created after cppool
    rados_obj.detete_pool(pool=pool_target["pool_name"])

    # Creating new target pool to test import/export
    create_given_pool(rados_obj, pool_target)

    # Creating temp file to hold pool info
    client_node.exec_command(cmd="touch /tmp/file", )

    # crating export of data on old pool
    cmd = f"rados export -p {pool_orig['pool_name']} /tmp/file"
    client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

    # Importing the file into the new pool
    cmd = f"rados import -p {pool_target['pool_name']} /tmp/file"
    client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

    # Sleeping for 2 seconds after copy to perform get operations
    time.sleep(2)

    do_rados_get(client_node, pool_target["pool_name"], 1)

    # Checking if the snapshots of pool was also copied
    # Snapshots of pool should not be copied
    for snap_name in snapshots:
        if pool_obj.check_snap_exists(snap_name=snap_name,
                                      pool_name=pool_target["pool_name"]):
            log.error("Snapshot of pool exists")
            return 1

    # deleting the Original & Target pool created after cppool
    rados_obj.detete_pool(pool=pool_target["pool_name"])
    rados_obj.detete_pool(pool=pool_orig["pool_name"])
    return 0
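
A hedged sketch for sanity-checking the copy operations above by comparing object counts between the source and target pools; it assumes client_node.exec_command returns the command's stdout as the first element of a (stdout, stderr) pair of strings.

def pools_have_same_object_count(client_node, src_pool: str, dst_pool: str) -> bool:
    # Hedged sketch: client_node and log come from the example above.
    counts = {}
    for pool in (src_pool, dst_pool):
        # Assumption: exec_command returns (stdout, stderr) as strings
        out, _ = client_node.exec_command(sudo=True,
                                          cmd=f"rados -p {pool} ls | wc -l")
        counts[pool] = int(str(out).strip())
    log.info(f"Object counts after copy: {counts}")
    return counts[src_pool] == counts[dst_pool]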
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """

    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    client_node = ceph_cluster.get_nodes(role="client")[0]

    site1_name = config["site1"]["name"]
    site2_name = config["site2"]["name"]

    # disabling automatic crush update
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    # Sleeping for 2 seconds after map update.
    time.sleep(2)

    # Setting the election strategy to connectivity mode
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Sleeping for 2 seconds after strategy update.
    time.sleep(2)

    # Checking updated election strategy in mon map
    strategy = mon_obj.get_election_strategy()
    if strategy != 3:
        log.error(
            f"Election strategy is not set to connectivity mode, i.e. {strategy}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    # Creating new datacenter crush objects and moving under root/default
    for name in [site1_name, site2_name]:
        cmd = f"ceph osd crush add-bucket {name} datacenter"
        rados_obj.run_ceph_command(cmd)
        time.sleep(2)
        move_crush_item(cephadm, crush_obj=name, name="root", value="default")
        time.sleep(2)

    # Moving all the OSD and Mon daemons into respective sites
    sites = ["site1", "site2", "site3"]
    for site in sites:
        mon_hosts = [
            host_obj.hostname
            for host_obj in ceph_cluster.get_nodes(role="mon")
        ]
        log.info(f"Mon hosts defined: {mon_hosts}")
        osd_hosts = [
            host_obj.hostname
            for host_obj in ceph_cluster.get_nodes(role="osd")
        ]
        log.info(f"OSD hosts defined: {osd_hosts}")
        # Collecting hosts from each site and setting locations accordingly
        site_details = config[site]
        crush_name = site_details["name"]
        host_nodes = cephadm.cluster.get_nodes()

        for item in site_details["hosts"]:
            host = [
                node for node in host_nodes if re.search(item, node.hostname)
            ][0]
            # Moving the mon daemons into site
            if host.hostname in mon_hosts:
                cmd = f"ceph mon set_location {host.hostname} datacenter={crush_name}"
                cephadm.shell([cmd])
                log.info(
                    f"Set location for mon {host.hostname} onto site {crush_name}\n"
                    "sleeping for 5 seconds")
                time.sleep(5)

            # Moving the osd daemons into site
            if host.hostname in osd_hosts:
                move_crush_item(
                    node=cephadm,
                    crush_obj=host.hostname,
                    name="datacenter",
                    value=crush_name,
                )
                log.info(
                    f"Set location for OSD {host.hostname} onto site {crush_name}\n"
                    "sleeping for 5 seconds")
                time.sleep(5)

    log.info("Moved all the hosts into respective sites")

    stretch_rule_name = config.get("stretch_rule_name", "stretch_rule")
    if not setup_crush_rule(
            node=client_node,
            rule_name=stretch_rule_name,
            site1=site1_name,
            site2=site2_name,
    ):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)

    # Enabling the stretch cluster mode
    tiebreaker_node = get_node_by_id(cephadm.cluster,
                                     config["site3"]["hosts"][0])
    log.info(f"tiebreaker node provided: {tiebreaker_node.hostname}")
    cmd = f"ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1
    time.sleep(2)

    # wait for PG's to settle down with new crush rules after deployment of stretch mode
    wait_for_clean_pg_sets(rados_obj)

    # Checking if the pools have been updated with the new crush rules
    acting_set = rados_obj.get_pg_acting_set()
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
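
A hedged follow-up check for the deployment above, confirming every replicated pool was switched to the stretch rule as the earlier comments expect; it uses "ceph osd crush rule dump" and "ceph osd dump", and the rule_name/rule_id fields are assumptions about their JSON output.

def all_pools_use_stretch_rule(rados_obj, stretch_rule_name: str) -> bool:
    # Hedged sketch: rados_obj and log come from the example above.
    rules = rados_obj.run_ceph_command(cmd="ceph osd crush rule dump")
    rule_ids = {rule["rule_name"]: rule["rule_id"] for rule in rules}
    if stretch_rule_name not in rule_ids:
        log.error(f"Crush rule {stretch_rule_name} not found on the cluster")
        return False

    osd_dump = rados_obj.run_ceph_command(cmd="ceph osd dump")
    for pool in osd_dump["pools"]:
        # type 1 is a replicated pool, as in the stray EC pool checks of the other examples
        if pool["type"] == 1 and pool["crush_rule"] != rule_ids[stretch_rule_name]:
            log.error(f"Pool {pool['pool_name']} is not using {stretch_rule_name}")
            return False
    return True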
Beispiel #19
0
def run(ceph_cluster, **kw):
    """
    Test to Verify the pg-autoscale flag.
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)
    pool_configs_path = "conf/pacific/rados/test-confs/pool-configurations.yaml"

    regex = r"\s*(\d\.\d)-rhel-\d"
    build = (re.search(regex, config.get("build",
                                         config.get("rhbuild")))).groups()[0]
    if not float(build) > 5.0:
        log.info(
            "Test running on version less than 5.1, skipping verifying autoscaler flags"
        )
        return 0

    # Setting the no-autoscale flag
    cmd = "ceph osd pool set noautoscale"
    rados_obj.run_ceph_command(cmd=cmd)

    # sleeping for 5 seconds as the command takes some time to affect the status of pools
    time.sleep(5)

    # Getting the autoscale configurations after setting the flag
    # all the pools should have autoscale set to off
    cmd = "ceph osd pool autoscale-status"
    pool_status = rados_obj.run_ceph_command(cmd=cmd)

    for entry in pool_status:
        if entry["pg_autoscale_mode"] == "on":
            log.error(
                f"Pg autoscaler not turned off for pool : {entry['pool_name']}"
            )
            return 1

    if not mon_obj.verify_set_config(section="global",
                                     name="osd_pool_default_pg_autoscale_mode",
                                     value="off"):
        log.error(
            "Default autoscale mode not set to off upon setting the no-autoscale flag"
        )
        return 1

    if not mon_obj.verify_set_config(
            section="mgr", name="mgr/pg_autoscaler/noautoscale", value="true"):
        log.error(
            "autoscale Flag not set to true upon setting the no-autoscale flag"
        )
        return 1

    # Creating a new pool, with the flag off, new pool should be created with autoscaler profile turned off
    with open(pool_configs_path, "r") as fd:
        pool_configs = yaml.safe_load(fd)

    pool_conf = pool_configs["replicated"]["sample-pool-2"]
    create_given_pool(rados_obj, pool_conf)

    cmd = "ceph osd pool autoscale-status"
    pool_status = rados_obj.run_ceph_command(cmd=cmd)

    for entry in pool_status:
        if entry["pool_name"] == pool_conf["pool_name"]:
            if entry["pg_autoscale_mode"] == "on":
                log.error(
                    f"Pg autoscaler not turned off for the new pool : {entry['pool_name']} "
                    f"created with flag turned off")
                return 1

    # Turning the autoscale flag back on. All the settings made earlier should be reverted
    cmd = "ceph osd pool unset noautoscale"
    rados_obj.run_ceph_command(cmd=cmd)

    # sleeping for 5 seconds as the command takes some time to affect the status of pools
    time.sleep(5)

    # Re-fetching the autoscale status after removing the flag
    cmd = "ceph osd pool autoscale-status"
    pool_status = rados_obj.run_ceph_command(cmd=cmd)

    for entry in pool_status:
        if entry["pg_autoscale_mode"] == "off":
            log.error(
                f"Pg autoscaler not turned on for pool : {entry['pool_name']}")
            return 1

    if not mon_obj.verify_set_config(section="global",
                                     name="osd_pool_default_pg_autoscale_mode",
                                     value="on"):
        log.error(
            "Default autoscale mode not set to on upon removing the no-autoscale flag"
        )
        return 1

    if not mon_obj.verify_set_config(section="mgr",
                                     name="mgr/pg_autoscaler/noautoscale",
                                     value="false"):
        log.error(
            "autoscale Flag not set to false upon removing the no-autoscale flag"
        )
        return 1

    # Deleting the pool created earlier
    if not rados_obj.detete_pool(pool=pool_conf["pool_name"]):
        log.error(f"the pool {pool_conf['pool_name']} could not be deleted")
        return 1

    log.info("Autoscale flag is working as expected.")
    return 0
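
The autoscale-status parse above is done twice; a hedged helper that maps each pool to its pg_autoscale_mode so both the "off" and "on" checks can reuse it.

def get_autoscale_modes(rados_obj) -> dict:
    # Hedged sketch: rados_obj comes from the example above.
    pool_status = rados_obj.run_ceph_command(cmd="ceph osd pool autoscale-status")
    return {entry["pool_name"]: entry["pg_autoscale_mode"] for entry in pool_status}

# Possible usage right after setting the noautoscale flag:
# if any(mode == "on" for mode in get_autoscale_modes(rados_obj).values()):
#     log.error("Some pools still have the autoscaler turned on")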
def run(ceph_cluster, **kw):
    """
    Automates OSD re-balance test scenarios.
    1. Create replicated and/or erasure pool/pools
    2. Identify the first osd to be removed
    3. Fetch the host by daemon_type=osd and osd id
    4. Fetch container id and device path
    5. Mark osd out and wait for pgs to be active+clean
    6. Remove OSD
    7. Zap device and wait for device not present
    8. Identify the second osd to be removed
    9. Fetch the host by daemon_type=osd and osd id
    10. Fetch container id and device path
    11. Mark osd out
    12. Add first osd and wait for device present and pgs to be active+clean
    """
    try:
        log.info(run.__doc__)
        config = kw["config"]
        cephadm = CephAdmin(cluster=ceph_cluster, **config)
        rados_obj = RadosOrchestrator(node=cephadm)
        client_node = ceph_cluster.get_nodes(role="client")[0]

        log.info("Running osd in progress rebalance tests")
        pool = create_pools(config, rados_obj, client_node)
        should_not_be_empty(pool, "Failed to retrieve pool details")
        write_to_pools(config, rados_obj, client_node)
        rados_obj.change_recover_threads(config=pool, action="set")
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=pool["pool_name"])
        log.info(f"Acting set {acting_pg_set}")
        should_not_be_empty(acting_pg_set, "Failed to retrieve acting pg set")
        osd_id = acting_pg_set[0]
        host = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id)
        should_not_be_empty(host, "Failed to fetch host details")
        dev_path = get_device_path(host, osd_id)
        log.debug(
            f"osd1 device path  : {dev_path}, osd_id : {osd_id}, host.hostname : {host.hostname}"
        )
        utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=True)
        method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id)
        method_should_succeed(wait_for_clean_pg_sets, rados_obj)
        utils.osd_remove(ceph_cluster, osd_id)
        method_should_succeed(wait_for_clean_pg_sets, rados_obj)
        method_should_succeed(utils.zap_device, ceph_cluster, host.hostname,
                              dev_path)
        method_should_succeed(wait_for_device, host, osd_id, action="remove")
        osd_id1 = acting_pg_set[1]
        host1 = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id1)
        should_not_be_empty(host1, "Failed to fetch host details")
        dev_path1 = get_device_path(host1, osd_id1)
        log.debug(
            f"osd2 device path  : {dev_path1}, osd_id : {osd_id1}, host.hostname : {host1.hostname}"
        )
        method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id1)
        utils.add_osd(ceph_cluster, host.hostname, dev_path, osd_id)
        method_should_succeed(wait_for_device, host, osd_id, action="add")
        method_should_succeed(wait_for_clean_pg_sets, rados_obj)

        acting_pg_set1 = rados_obj.get_pg_acting_set(
            pool_name=pool["pool_name"])
        if len(acting_pg_set) != len(acting_pg_set1):
            log.error(
                f"Acting pg set count before {acting_pg_set} and after {acting_pg_set1} rebalance mismatched"
            )
            return 1

        if pool.get("rados_put", False):
            do_rados_get(client_node, pool["pool_name"], 1)
        utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=False)
        rados_obj.change_recover_threads(config=pool, action="rm")
        if config.get("delete_pools"):
            for name in config["delete_pools"]:
                method_should_succeed(rados_obj.detete_pool, name)
            log.info("deleted all the given pools successfully")
        return 0
    except Exception as e:
        log.info(e)
        log.info(traceback.format_exc())
        return 1
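
A hedged sketch of waiting for an OSD id to disappear from (or reappear in) the OSD tree, similar in spirit to the wait_for_device checks used above; the nodes/id/type fields are assumptions about the "ceph osd tree" JSON output.

def wait_for_osd_in_tree(rados_obj, osd_id: int, present: bool,
                         timeout: int = 600, interval: int = 20) -> bool:
    # Hedged sketch: rados_obj, log and time come from the example above.
    elapsed = 0
    while elapsed <= timeout:
        tree = rados_obj.run_ceph_command(cmd="ceph osd tree")
        osd_ids = {node["id"] for node in tree["nodes"] if node.get("type") == "osd"}
        if (osd_id in osd_ids) == present:
            log.info(f"osd.{osd_id} is {'present' if present else 'absent'} in the OSD tree")
            return True
        time.sleep(interval)
        elapsed += interval
    log.error(f"Timed out waiting for osd.{osd_id} (present={present})")
    return False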
Beispiel #21
0
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Actions Performed:
    1. Disables the automatic crush map update
    2. Collects the OSD daemons in the cluster and split them into 2 sites.
    3. If add capacity is selected, only half of the OSD's will be added to various sites initially.
    4. Adds the stretch rule into crush map.
    5. Adding monitors into the 2 sites.
    6. Create a replicated pool and deploy stretch mode.
    7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites )
    8. Check for the bump in election epochs throughout.
    9. Check the acting set in PG for 4 OSD's. 2 from each site.
    Verifies bugs:
    [1]. https://bugzilla.redhat.com/show_bug.cgi?id=1937088
    [2]. https://bugzilla.redhat.com/show_bug.cgi?id=1952763
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    client_node = ceph_cluster.get_nodes(role="client")[0]
    tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0]

    if not client_node and not tiebreaker_node:
        log.error(
            "Admin client and tie breaker node not configured, Cannot modify crush rules for stretch cluster"
        )
        return 1
    mon_state = get_mon_details(node=cephadm)
    if len(list(mon_state["monitors"])) < 5:
        log.error(
            f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}"
        )
        return 1
    osd_details = get_osd_details(node=cephadm)
    if len(osd_details.keys()) < 4:
        log.error(
            f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}"
        )
        return 1

    # Finding and Deleting any stray EC pools that might have been left on cluster
    pool_dump = rados_obj.run_ceph_command(cmd="ceph osd dump")
    for entry in pool_dump["pools"]:
        if entry["type"] != 1 and entry["crush_rule"] != 0:
            log.info(
                f"A non-replicated pool found : {entry['pool_name']}, proceeding to delete pool"
            )
            if not rados_obj.detete_pool(pool=entry["pool_name"]):
                log.error(
                    f"the pool {entry['pool_name']} could not be deleted")
                return 1
        log.debug("No pools other than replicated found on cluster")

    # disabling automatic crush update
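    # osd_crush_update_on_start=false stops OSDs from re-registering under their default
    # host buckets on restart, so the manual site placement done below is preserved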
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    site1 = config.get("site1", "site1")
    site2 = config.get("site2", "site2")

    # Collecting OSD details and splitting them into Site A and Site B
    sorted_osds = sort_osd_sites(all_osd_details=osd_details)
    site_a_osds = sorted_osds[0]
    site_b_osds = sorted_osds[1]
    if config.get("perform_add_capacity"):
        site_a_osds = sorted_osds[0][:(len(sorted_osds[0]) // 2)]
        site_b_osds = sorted_osds[1][:(len(sorted_osds[1]) // 2)]

    if not set_osd_sites(
            node=cephadm,
            osds=site_a_osds,
            site=site1,
            all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    if not set_osd_sites(
            node=cephadm,
            osds=site_b_osds,
            site=site2,
            all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    # Adding the stretch rule into the crush map
    stretch_rule_name = "stretch_rule"
    if not setup_crush_rule(node=client_node,
                            rule_name=stretch_rule_name,
                            site1=site1,
                            site2=site2):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Setting the election strategy to connectivity mode
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)
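    # Snapshot of the mon map right after enabling connectivity mode; its election
    # epoch is the baseline used for the epoch-bump checks below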
    init_mon_state = get_mon_details(node=cephadm)

    # Checking if mon elections happened after changing election strategy
    if mon_state["epoch"] > init_mon_state["epoch"]:
        log.error(
            "Election epoch not bumped up after setting the connectivity mode."
        )
        return 1

    # Checking updated election strategy in mon map
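    # In the monmap, election_strategy is reported as 1 (classic), 2 (disallow) or 3 (connectivity)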
    strategy = mon_obj.get_election_strategy()
    if strategy != 3:
        log.error(
            f"Election strategy on the cluster is not connectivity (3), found : {strategy}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    log.info(
        f"Selecting mon : {tiebreaker_node.hostname} as the tie breaker monitor on site 3")
    if not set_mon_sites(node=cephadm,
                         tiebreaker_node=tiebreaker_node,
                         site1=site1,
                         site2=site2):
        log.error("Failed to ad monitors into respective sites")
        return 1

    # All existing pools should automatically be switched to the stretch rule. Creating a test pool to verify
    pool_name = "test_pool_1"
    if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
        log.error("Failed to create the replicated Pool")
        return 1

    log.info("Monitors added to respective sites. enabling stretch rule")
    cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1

    if get_mon_details(node=cephadm)["epoch"] < init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after Enabling strech mode")
        return 1

    if config.get("perform_add_capacity"):
        pool_name = "test_stretch_pool"
        if not rados_obj.create_pool(
                pool_name=pool_name,
                crush_rule=stretch_rule_name,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
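        # Write 1000 objects into the stretch pool so that the add-capacity step below
        # has data to backfill; the same objects are read back in parallel once recovery completes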
        do_rados_put(mon=client_node, pool=pool_name, nobj=1000)

        # Increasing backfill/recovery threads so that the cluster re-balances faster after add capacity
        rados_obj.change_recover_threads(config=config, action="set")

        log.info(
            "Performing add Capacity after the deployment of stretch cluster")
        site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds]
        site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds]

        if not set_osd_sites(
                node=cephadm,
                osds=site_a_osds,
                site=site1,
                all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1
        if not set_osd_sites(
                node=cephadm,
                osds=site_b_osds,
                site=site2,
                all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1

        # Waiting for up to 2.5 hours for the PG's to enter active + Clean state after add capacity
        # Automation for bug : [1] & [2]
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
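                # Transient PG states; any PG still reporting one of these means
                # recovery/backfill is not yet complete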
                rec = (
                    "remapped",
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "peering",
                    "recovering",
                    "recovery_wait",
                    "undersized",
                    "backfill_wait",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info(
                    "Recovery and backfilling completed, all PGs are active+clean")
                break
            log.info(
                f"Waiting for active+clean. Active alerts: {status_report['health']['checks'].keys()}, "
                f"PG states: {status_report['num_pg_by_state']}."
                f" Checking status again in 2 minutes")
            time.sleep(120)
        rados_obj.change_recover_threads(config=config, action="rm")
        if not flag:
            log.error(
                "The cluster did not reach active+clean state after add capacity"
            )
            return 1

        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)

    # Checking if the pools have been updated with the new crush rules
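    # With stretch mode enabled, pool size is bumped to 4, so every PG is expected
    # to have 4 OSDs in its acting set (2 from each site)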
    acting_set = rados_obj.get_pg_acting_set(pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"Acting set {acting_set} has {len(acting_set)} OSDs; a stretch cluster requires 4 (2 from each site)"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
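
A possible follow-up check, not part of the example above: the sketch below uses the same run_ceph_command helper seen in these examples to confirm that the OSD map reports stretch mode once enable_stretch_mode has run. The "stretch_mode" section with its "stretch_mode_enabled" flag is assumed to be present in the JSON output of "ceph osd dump" on recent Ceph releases; verify the field names against the target build before relying on this.
def is_stretch_mode_active(rados_obj) -> bool:
    """
    Sketch: returns True when the OSD map reports stretch mode as enabled.
    Assumes rados_obj exposes run_ceph_command() like the RadosOrchestrator above.
    """
    osd_dump = rados_obj.run_ceph_command(cmd="ceph osd dump")
    # "stretch_mode" is a sub-section of the JSON OSD map dump (field names assumed,
    # verify on the target Ceph release)
    stretch_info = osd_dump.get("stretch_mode", {})
    return bool(stretch_info.get("stretch_mode_enabled", False))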