def run(ceph_cluster, **kw):
    """
    Automates OSD re-balance test scenarios.
    1. Create replicated and/or erasure pool/pools
    2. Identify the first osd to be removed
    3. Fetch the host by daemon_type=osd and osd id
    4. Fetch container id and device path
    5. Mark osd out and wait for pgs to be active+clean
    6. Remove OSD
    7. Zap device and wait for device not present
    8. Identify the second osd to be removed
    9. Fetch the host by daemon_type=osd and osd id
    10. Fetch container id and device path
    11. Mark osd out
    12. Add first osd and wait for device present and pgs to be active+clean
    """
    try:
        log.info(run.__doc__)
        config = kw["config"]
        cephadm = CephAdmin(cluster=ceph_cluster, **config)
        rados_obj = RadosOrchestrator(node=cephadm)
        client_node = ceph_cluster.get_nodes(role="client")[0]

        log.info("Running osd in progress rebalance tests")
        pool = create_pools(config, rados_obj, client_node)
        should_not_be_empty(pool, "Failed to retrieve pool details")
        write_to_pools(config, rados_obj, client_node)
        rados_obj.change_recover_threads(config=pool, action="set")
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=pool["pool_name"])
        log.info(f"Acting set {acting_pg_set}")
        should_not_be_empty(acting_pg_set, "Failed to retrieve acting pg set")
        osd_id = acting_pg_set[0]
        host = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id)
        should_not_be_empty(host, "Failed to fetch host details")
        dev_path = get_device_path(host, osd_id)
        log.debug(
            f"osd1 device path  : {dev_path}, osd_id : {osd_id}, host.hostname : {host.hostname}"
        )
        utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=True)
        method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id)
        method_should_succeed(wait_for_clean_pg_sets, rados_obj)
        utils.osd_remove(ceph_cluster, osd_id)
        method_should_succeed(wait_for_clean_pg_sets, rados_obj)
        method_should_succeed(utils.zap_device, ceph_cluster, host.hostname,
                              dev_path)
        method_should_succeed(wait_for_device, host, osd_id, action="remove")
        osd_id1 = acting_pg_set[1]
        host1 = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id1)
        should_not_be_empty(host1, "Failed to fetch host details")
        dev_path1 = get_device_path(host1, osd_id1)
        log.debug(
            f"osd2 device path  : {dev_path1}, osd_id : {osd_id1}, host.hostname : {host1.hostname}"
        )
        method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id1)
        utils.add_osd(ceph_cluster, host.hostname, dev_path, osd_id)
        method_should_succeed(wait_for_device, host, osd_id, action="add")
        method_should_succeed(wait_for_clean_pg_sets, rados_obj)

        acting_pg_set1 = rados_obj.get_pg_acting_set(
            pool_name=pool["pool_name"])
        if len(acting_pg_set) != len(acting_pg_set1):
            log.error(
                f"Acting pg set count before {acting_pg_set} and after {acting_pg_set1} rebalance mismatched"
            )
            return 1

        if pool.get("rados_put", False):
            do_rados_get(client_node, pool["pool_name"], 1)
        utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=False)
        rados_obj.change_recover_threads(config=pool, action="rm")
        if config.get("delete_pools"):
            for name in config["delete_pools"]:
                method_should_succeed(rados_obj.detete_pool, name)
            log.info("deleted all the given pools successfully")
        return 0
    except Exception as e:
        log.error(e)
        log.error(traceback.format_exc())
        return 1
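
The method_should_succeed and should_not_be_empty helpers used above come from the shared test utility modules; a minimal sketch of the behavior this example assumes from them (the real implementations may differ):

def method_should_succeed(method, *args, **kwargs):
    # Invoke the callable and treat a falsy return as a test failure.
    result = method(*args, **kwargs)
    if not result:
        raise AssertionError(
            f"{getattr(method, '__name__', method)} failed for args={args}, kwargs={kwargs}"
        )


def should_not_be_empty(value, msg="Value is empty"):
    # Fail fast when a fetched value (pool details, acting set, host) is empty.
    if not value:
        raise AssertionError(msg)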
Example #2
def run(ceph_cluster, **kw):
    """
    Automates OSD re-balance test scenarios.
    1. Create replicated and/or erasure pool/pools
    2. Identify the osd to be removed
    3. Fetch the host by daemon_type=osd and osd id
    4. Fetch container id and device path
    5. Mark osd out and wait for pgs to be active+clean
    6. Remove OSD
    7. Zap device and wait for device not present
    8. Add OSD and wait for device present and pgs to be active+clean
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    client_node = ceph_cluster.get_nodes(role="client")[0]

    log.info("Running create pool test case")
    pool = None  # guard against a NameError when create_pools is absent
    if config.get("create_pools"):
        pools = config.get("create_pools")
        for each_pool in pools:
            cr_pool = each_pool["create_pool"]
            if cr_pool.get("pool_type", "replicated") == "erasure":
                method_should_succeed(rados_obj.create_erasure_pool,
                                      name=cr_pool["pool_name"],
                                      **cr_pool)
            else:
                method_should_succeed(rados_obj.create_pool,
                                      pool_name=cr_pool["pool_name"],
                                      **cr_pool)
            method_should_succeed(rados_obj.bench_write, **cr_pool)
        pool = random.choice(pools)["create_pool"]
    if not pool:
        log.error("Failed to retrieve pool details")
        return 1

    rados_obj.change_recover_threads(config=pool, action="set")
    acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool["pool_name"])
    log.info(f"Acting set {acting_pg_set}")
    if not acting_pg_set:
        log.error("Failed to retrieve acting pg set")
        return 1
    osd_id = acting_pg_set[0]
    host = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id)
    if not host:
        log.error("Failed to fetch host details")
        return 1
    # fetch container id
    out, _ = host.exec_command(sudo=True, cmd="podman ps --format json")
    container_id = next(
        (
            item["Names"][0]
            for item in json.loads(out.read().decode())
            if f"osd.{osd_id}" in item["Command"]
        ),
        None,
    )
    if not container_id:
        log.error("Failed to retrieve container id")
        return 1
    # fetch device path by osd_id
    vol_out, _ = host.exec_command(
        sudo=True,
        cmd=f"podman exec {container_id} ceph-volume lvm list --format json",
    )
    volume_out = vol_out.read().decode()
    dev_path = next(
        (
            v[0]["devices"][0]
            for k, v in json.loads(volume_out).items()
            if str(k) == str(osd_id)
        ),
        None,
    )
    if not dev_path:
        log.error("Failed to get device path")
        return 1
    log.debug(
        f"device path  : {dev_path}, osd_id : {osd_id}, host.hostname : {host.hostname}"
    )
    utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=True)
    method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id)
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    utils.osd_remove(ceph_cluster, osd_id)
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    method_should_succeed(utils.zap_device, ceph_cluster, host.hostname,
                          dev_path)
    method_should_succeed(wait_for_device,
                          host,
                          container_id,
                          osd_id,
                          action="remove")
    utils.add_osd(ceph_cluster, host.hostname, dev_path, osd_id)
    method_should_succeed(wait_for_device,
                          host,
                          container_id,
                          osd_id,
                          action="add")
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    do_rados_put(mon=client_node, pool=pool["pool_name"], nobj=1000)
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=False)
    rados_obj.change_recover_threads(config=pool, action="rm")

    if config.get("delete_pools"):
        for name in config["delete_pools"]:
            method_should_succeed(rados_obj.detete_pool, name)
        log.info("deleted all the given pools successfully")

    return 0
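
Example #1 relies on a get_device_path helper instead of inlining this lookup; a plausible sketch that mirrors the podman and ceph-volume commands used above (the exec_command semantics are assumed to match this example):

import json


def get_device_path(host, osd_id):
    # Hypothetical helper: locate the OSD container, then ask ceph-volume
    # inside it for the backing device of the given OSD id.
    out, _ = host.exec_command(sudo=True, cmd="podman ps --format json")
    container = next(
        (item["Names"][0] for item in json.loads(out.read().decode())
         if f"osd.{osd_id}" in item["Command"]),
        None,
    )
    if not container:
        return None
    vol_out, _ = host.exec_command(
        sudo=True,
        cmd=f"podman exec {container} ceph-volume lvm list --format json",
    )
    volumes = json.loads(vol_out.read().decode())
    return next(
        (v[0]["devices"][0] for k, v in volumes.items() if str(k) == str(osd_id)),
        None,
    )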
Example #3
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        log.info("Stopped 'm' number of OSD's from, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfilling_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info("The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active alerts: {status_report['health']['checks'].keys()}, "
                f"PG States : {status_report['num_pg_by_state']}, "
                f"checking status again in 1 minute"
            )
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error("The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(f"the pool {ec_config['pool_name']} could not be deleted")
            return 1

        log.info("Successfully tested EC pool recovery with K osd's surviving")
        return 0
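
A config shape that would drive this test; the key names are inferred from how ec_config is consumed above (pool_name, k, and m feed the acting-set checks, the rest passes through to pool creation and rados bench), so treat the exact fields as assumptions:

config = {
    "ec_pool_recovery_improvement": {
        "pool_name": "recovery",      # consumed by get_pg_acting_set / detete_pool
        "k": 4,                       # data chunks; k OSDs must survive
        "m": 2,                       # coding chunks; this many OSDs are stopped
        "pg_num": 32,                 # passed through to create_erasure_pool (assumed)
        "rados_write_duration": 50,   # passed through to bench_write (assumed)
        "rados_read_duration": 30,    # passed through to bench_read (assumed)
    }
}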
Example #4
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)
    pool_obj = PoolFunctions(node=cephadm)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        log.info(
            "Stopped 'm' OSD's from the acting set, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfilling_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info(
                    "The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active alerts: {status_report['health']['checks'].keys()}, "
                f"PG States : {status_report['num_pg_by_state']}, "
                f"checking status again in 1 minute")
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error(
                "The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(
                f"the pool {ec_config['pool_name']} could not be deleted")
            return 1

        log.info("Successfully tested EC pool recovery with K osd's surviving")
        return 0

    if config.get("Compression_tests"):
        """
        Create a 2 replicated pools:
        1. Pool_1 : enable any compression algorithm(def snappy) and compression mode(aggressive/force).
        2. Pool_2 : set compression mode to none
        Writing the same amount of data on 2 pools, size of pool with compression on would consume less space
        """
        pool_config = config["Compression_tests"]["pool_config"]
        compression_config = config["Compression_tests"]["compression_config"]
        pool_1 = pool_config["pool-1"]
        pool_2 = pool_config["pool-2"]

        if config["Compression_tests"]["pool_type"] == "replicated":
            if not rados_obj.create_pool(pool_name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            if not rados_obj.create_pool(pool_name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
        elif config["Compression_tests"]["pool_type"] == "erasure":
            pool_config["pool_name"] = pool_1
            if not rados_obj.create_erasure_pool(name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            pool_config["pool_name"] = pool_2
            if not rados_obj.create_erasure_pool(name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
            del pool_config["pool_name"]

        log.debug("Created two pools to test compression")

        # Enabling compression on pool-1
        if not rados_obj.pool_inline_compression(pool_name=pool_1,
                                                 **compression_config):
            log.error(
                f"Error setting compression on pool : {pool_1} for config {compression_config}"
            )
            return 1

        # Writing the same amount of data into two pools
        if not rados_obj.bench_write(pool_name=pool_1, **pool_config):
            log.error(
                "Failed to write objects into Pool-1, with compression enabled"
            )
            return 1

        if not rados_obj.bench_write(pool_name=pool_2, **pool_config):
            log.error(
                "Failed to write objects into Pool-2, without compression enabled"
            )
            return 1
        # Sleeping for 5 seconds for status to be updated.
        time.sleep(5)

        log.debug(
            "Finished writing data into the two pools. Checking pool stats")
        try:
            pool_stats = rados_obj.run_ceph_command(
                cmd="ceph df detail")["pools"]
            pool_1_stats = [
                detail for detail in pool_stats if detail["name"] == pool_1
            ][0]["stats"]
            pool_2_stats = [
                detail for detail in pool_stats if detail["name"] == pool_2
            ][0]["stats"]
        except KeyError:
            log.error(
                "No stats about the pools requested found on the cluster")
            return 1

        log.debug(f"Pool-1 stats: {pool_1_stats}")
        log.debug(f"Pool-2 stats: {pool_2_stats}")
        if pool_1_stats["compress_bytes_used"] <= 0:
            log.error("No data stored under pool-1 is compressed")
            return 1

        if pool_1_stats["kb_used"] >= pool_2_stats["kb_used"]:
            log.error("Compression has no effect on the pool size...")
            return 1

        if config["Compression_tests"].get("verify_compression_ratio_set"):
            # added verification for test: CEPH-83571672
            if not rados_obj.check_compression_size(pool_name=pool_1,
                                                    **compression_config):
                log.error("data not compressed in accordance to ratio set")
                return 1

        log.info("Pool size is less when compression is enabled")
        return 0

    if config.get("test_autoscaler_bulk_feature"):
        """
        Tests to verify the autoscaler bulk flag, which allows pools to make use of
        the scale-down profile, making those pools start with a full complement of PGs.
        Tests include
        1. creating new pools with bulk,
        2. enabling/disabling bulk flag on existing pools
        3. Verify the PG changes when the flag is set/unset
        Verifies bugs : https://bugzilla.redhat.com/show_bug.cgi?id=2049851
        """
        regex = r"\s*(\d\.\d)-rhel-\d"
        build = (re.search(regex,
                           config.get("build",
                                      config.get("rhbuild")))).groups()[0]
        if not float(build) > 5.0:
            log.info(
                "Test running on a version older than 5.1, skipping verification of bulk flags"
            )
            return 0

        # Creating a pool with bulk feature
        pool_name = config.get("pool_name")
        if not pool_obj.set_bulk_flag(pool_name=pool_name):
            log.error("Failed to create a pool with bulk features")
            return 1

        # Checking the autoscaler status, final PG counts, bulk flags
        pg_target_init = pool_obj.get_target_pg_num_bulk_flag(
            pool_name=pool_name)

        # Unsetting the bulk flag and checking the change in the PG counts
        if not pool_obj.rm_bulk_flag(pool_name=pool_name):
            log.error("Failed to create a pool with bulk features")
            return 1

        # Sleeping for 5 seconds for the new PG num to be set
        time.sleep(5)
        pg_target_interim = pool_obj.get_target_pg_num_bulk_flag(
            pool_name=pool_name)

        # The target PG count with the flag disabled must be lower than when enabled
        if pg_target_interim >= pg_target_init:
            log.error("PG's not reduced after bulk flag disabled")
            return 1

        # Setting the bulk flag on pool again and checking the change in the PG counts
        if not pool_obj.set_bulk_flag(pool_name=pool_name):
            log.error("Failed to disable/remove bulk features on pool")
            return 1

        # Sleeping for 5 seconds for the new PG num to be set
        time.sleep(5)

        pg_target_final = pool_obj.get_target_pg_num_bulk_flag(
            pool_name=pool_name)

        # The target PG count after re-enabling the flag must be higher than when disabled
        if pg_target_interim >= pg_target_final:
            log.error("PG's not Increased after bulk flag Enabled")
            return 1

        if config.get("delete_pool"):
            rados_obj.detete_pool(pool=pool_name)
        log.info("Verified the workings of bulk flag")
        return 0

    if config.get("verify_pool_target_ratio"):
        log.debug("Verifying target size ratio on pools")
        target_configs = config["verify_pool_target_ratio"]["configurations"]
        # Creating pools and starting the test
        for entry in target_configs.values():
            log.debug(f"Creating {entry['pool_type']} pool on the cluster")
            if entry.get("pool_type", "replicated") == "erasure":
                method_should_succeed(rados_obj.create_erasure_pool,
                                      name=entry["pool_name"],
                                      **entry)
            else:
                method_should_succeed(
                    rados_obj.create_pool,
                    **entry,
                )
            rados_obj.bench_write(**entry)
            if not pool_obj.verify_target_ratio_set(
                    pool_name=entry["pool_name"],
                    ratio=entry["target_size_ratio"]):
                log.error(
                    f"Could not change the target ratio on the pool: {entry['pool_name']}"
                )
                return 1
            log.debug("Set the ratio. getting the projected pg's")

            rados_obj.change_recover_threads(config=config, action="set")
            log.debug(
                "Waiting for the rebalancing to complete on the cluster after the change"
            )
            # Sleeping for 2 minutes for rebalancing to start & for new PG count to be updated.
            time.sleep(120)

            new_pg_count = int(
                pool_obj.get_pg_autoscaler_value(pool_name=entry["pool_name"],
                                                 item="pg_num_target"))
            if new_pg_count <= entry["pg_num"]:
                log.error(
                    f"Count of PG's not increased on the pool: {entry['pool_name']}, "
                    f"initial creation count : {entry['pg_num']}, "
                    f"new count after setting the target ratio : {new_pg_count}")
                return 1

            res = wait_for_clean_pg_sets(rados_obj)
            if not res:
                log.error(
                    "PG's in cluster are not active + Clean after the ratio change"
                )
                return 1
            if not pool_obj.verify_target_ratio_set(
                    pool_name=entry["pool_name"], ratio=0.0):
                log.error(
                    f"Could not remove the target ratio on the pool: {entry['pool_name']}"
                )
                return 1

            # Sleeping for 2 minutes for rebalancing to start & for new PG count to be updated.
            time.sleep(120)
            # Checking if after the removal of ratio, the PG count has reduced
            end_pg_count = int(
                pool_obj.get_pg_autoscaler_value(pool_name=entry["pool_name"],
                                                 item="pg_num_target"))
            if end_pg_count >= new_pg_count:
                log.error(
                    f"Count of PG's not changed/ reverted on the pool: {entry['pool_name']}"
                    f" after removing the target ratios")
                return 1
            rados_obj.change_recover_threads(config=config, action="rm")
            if entry.get("delete_pool", False):
                rados_obj.detete_pool(pool=entry["pool_name"])
            log.info(
                f"Completed the test of target ratio on pool: {entry['pool_name']} "
            )
        log.info("Target ratio tests completed")
        return 0

    if config.get("verify_mon_target_pg_per_osd"):
        pg_conf = config.get("verify_mon_target_pg_per_osd")
        if not mon_obj.set_config(**pg_conf):
            log.error("Could not set the value for mon_target_pg_per_osd ")
            return 1
        mon_obj.remove_config(**pg_conf)
        log.info("Set and verified the value for mon_target_pg_per_osd ")
        return 0

    if config.get("verify_pg_num_min"):
        log.debug("Verifying pg_num_min on pools")
        target_configs = config["verify_pg_num_min"]["configurations"]
        # Creating pools and starting the test
        for entry in target_configs.values():
            log.debug(f"Creating {entry['pool_type']} pool on the cluster")
            if entry.get("pool_type", "replicated") == "erasure":
                method_should_succeed(rados_obj.create_erasure_pool,
                                      name=entry["pool_name"],
                                      **entry)
            else:
                method_should_succeed(
                    rados_obj.create_pool,
                    **entry,
                )
            rados_obj.bench_write(**entry)

            if not rados_obj.set_pool_property(pool=entry["pool_name"],
                                               props="pg_num_min",
                                               value=entry["pg_num_min"]):
                log.error("Could not set the pg_min_size on the pool")
                return 1

            if entry.get("delete_pool", False):
                rados_obj.detete_pool(pool=entry["pool_name"])
            log.info(
                f"Completed the test of pg_num_min on pool: {entry['pool_name']}"
            )
        log.info("pg_num_min tests completed")
        return 0
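
wait_for_clean_pg_sets is called throughout these examples; a sketch of the behavior they assume, modeled on the inline recovery-wait loops above (the timeout and poll interval here are assumptions):

import datetime
import time


def wait_for_clean_pg_sets(rados_obj, timeout=9000):
    # Poll `ceph report` until no PG reports a recovery/backfill state,
    # mirroring the inline loops used in the EC recovery test above.
    recovery_states = {
        "backfilling", "degraded", "incomplete", "recovering",
        "recovery_wait", "backfilling_wait", "peered", "undersized",
    }
    end_time = datetime.datetime.now() + datetime.timedelta(seconds=timeout)
    while end_time > datetime.datetime.now():
        report = rados_obj.run_ceph_command(cmd="ceph report")
        if not any(
            recovery_states.intersection(entry["state"].split("+"))
            for entry in report["num_pg_by_state"]
        ):
            return True
        time.sleep(60)
    return False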
Example #5
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Actions Performed:
    1. Disables the automatic crush map update
    2. Collects the OSD daemons in the cluster and splits them into 2 sites.
    3. If add capacity is selected, only half of the OSD's will be added to various sites initially.
    4. Adds the stretch rule into crush map.
    5. Adding monitors into the 2 sites.
    6. Create a replicated pool and deploy stretch mode.
    7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites )
    8. Check for the bump in election epochs throughout.
    9. Check the acting set in PG for 4 OSD's. 2 from each site.
    Verifies bugs:
    [1]. https://bugzilla.redhat.com/show_bug.cgi?id=1937088
    [2]. https://bugzilla.redhat.com/show_bug.cgi?id=1952763
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    client_node = ceph_cluster.get_nodes(role="client")[0]
    tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0]

    if not client_node or not tiebreaker_node:
        log.error(
            "Admin client and tie breaker node not configured, Cannot modify crush rules for stretch cluster"
        )
        return 1
    mon_state = get_mon_details(node=cephadm)
    if len(list(mon_state["monitors"])) < 5:
        log.error(
            f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}"
        )
        return 1
    osd_details = get_osd_details(node=cephadm)
    if len(osd_details.keys()) < 4:
        log.error(
            f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}"
        )
        return 1

    if config.get("verify_forced_recovery"):
        log.info("Verifying forced recovery and healthy in stretch environment")

        pool_name = "stretch_pool_recovery"
        if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
            log.error("Failed to create the replicated Pool")
            return 1

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool_name)

        log.info(
            f"Killing 2 OSD's from acting set : {acting_pg_set} to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(2)]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        log.info("Stopped 2 OSD's from acting set, starting to wait for recovery")
        rados_obj.change_recover_threads(config=config, action="set")

        if not rados_obj.bench_write(pool_name=pool_name, **config):
            log.error("Failed to write objects into the Pool")
            return 1

        log.debug("Triggering forced recovery in stretch mode")
        cmd = "ceph osd force_recovery_stretch_mode --yes-i-really-mean-it"
        rados_obj.run_ceph_command(cmd)
        log.info("Triggered the recovery in stretch mode")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # there was data written into pool when the OSD's were down.
        # Verifying if data is recovered and placed into the OSD's after bringing them back
        res = wait_for_clean_pg_sets(rados_obj)
        if not res:
            log.error("PG's in cluster are not active + Clean ")
            return 1

        log.debug("Forcing the stretch cluster into healthy mode")
        cmd = "ceph osd force_healthy_stretch_mode --yes-i-really-mean-it"
        rados_obj.run_ceph_command(cmd)

        log.info("Cluster has successfully recovered and is in healthy state")
        return 0

    # Finding and Deleting any stray EC pools that might have been left on cluster
    pool_dump = rados_obj.run_ceph_command(cmd="ceph osd dump")
    for entry in pool_dump["pools"]:
        if entry["type"] != 1 and entry["crush_rule"] != 0:
            log.info(
                f"A non-replicated pool found : {entry['pool_name']}, proceeding to delete pool"
            )
            if not rados_obj.detete_pool(pool=entry["pool_name"]):
                log.error(f"the pool {entry['pool_name']} could not be deleted")
                return 1
    log.debug("Only replicated pools remain on the cluster")

    # disabling automatic crush update
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    site1 = config.get("site1", "site1")
    site2 = config.get("site2", "site2")

    # Collecting osd details and splitting them into Site A and Site B
    sorted_osds = sort_osd_sites(all_osd_details=osd_details)
    site_a_osds = sorted_osds[0]
    site_b_osds = sorted_osds[1]
    if config.get("perform_add_capacity"):
        site_a_osds = sorted_osds[0][: (len(sorted_osds[0]) // 2)]
        site_b_osds = sorted_osds[1][: (len(sorted_osds[1]) // 2)]

    if not set_osd_sites(
        node=cephadm,
        osds=site_a_osds,
        site=site1,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    if not set_osd_sites(
        node=cephadm,
        osds=site_b_osds,
        site=site2,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    # collecting mon map to be compared after stretch cluster deployment
    stretch_rule_name = "stretch_rule"
    if not setup_crush_rule(
        node=client_node, rule_name=stretch_rule_name, site1=site1, site2=site2
    ):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Setting the election strategy to connectivity mode
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)
    init_mon_state = get_mon_details(node=cephadm)

    # Checking if mon elections happened after changing election strategy
    if mon_state["epoch"] >= init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after setting the connectivity mode.")
        return 1

    # Checking updated election strategy in mon map
    strategy = mon_obj.get_election_strategy()
    if strategy != 3:
        log.error(
            f"cluster created election strategy other than connectivity, i.e {strategy}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    log.info(f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3")
    if not set_mon_sites(
        node=cephadm, tiebreaker_node=tiebreaker_node, site1=site1, site2=site2
    ):
        log.error("Failed to ad monitors into respective sites")
        return 1

    # All the existing pools should be automatically changed with stretch rule. Creating a test pool
    pool_name = "test_pool_1"
    if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
        log.error("Failed to create the replicated Pool")
        return 1

    log.info("Monitors added to respective sites. enabling stretch rule")
    cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1

    if get_mon_details(node=cephadm)["epoch"] < init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after Enabling strech mode")
        return 1

    # Increasing backfill/rebalance threads so that cluster will re-balance it faster
    rados_obj.change_recover_threads(config=config, action="set")

    # wait for active + clean after deployment of stretch mode
    # checking the state after deployment because of BZ : https://bugzilla.redhat.com/show_bug.cgi?id=2025800
    res = wait_for_clean_pg_sets(rados_obj)
    if not res:
        status_report = rados_obj.run_ceph_command(cmd="ceph report")
        # Proceeding to check if all PG's are in active + clean
        for entry in status_report["num_pg_by_state"]:
            rec = ("remapped", "peering")
            if any(key in rec for key in entry["state"].split("+")):
                log.error(
                    "PG's in cluster are stuck in remapped+peering after stretch deployment."
                )
                return 1

    if config.get("perform_add_capacity"):
        pool_name = "test_stretch_pool"
        if not rados_obj.create_pool(
            pool_name=pool_name,
            crush_rule=stretch_rule_name,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
        do_rados_put(mon=client_node, pool=pool_name, nobj=100)

        log.info("Performing add Capacity after the deployment of stretch cluster")
        site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds]
        site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds]

        if not set_osd_sites(
            node=cephadm,
            osds=site_a_osds,
            site=site1,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1
        if not set_osd_sites(
            node=cephadm,
            osds=site_b_osds,
            site=site2,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1

        flag = wait_for_clean_pg_sets(rados_obj)
        if not flag:
            log.error(
                "The cluster did not reach active + Clean state after add capacity"
            )
            return 1

        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)
        log.info("Successfully completed Add Capacity scenario")

    rados_obj.change_recover_threads(config=config, action="rm")

    # Checking if the pools have been updated with the new crush rules
    acting_set = rados_obj.get_pg_acting_set(pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
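
The setup_crush_rule helper is assumed to install a two-site replicated rule of the shape documented for stretch mode (two replicas per datacenter); a sketch of that rule as a string, with the id and the default site names from above as placeholders:

# Assumed shape of the CRUSH rule installed by setup_crush_rule.
STRETCH_RULE = """
rule stretch_rule {
        id 1
        type replicated
        min_size 1
        max_size 10
        step take site1
        step chooseleaf firstn 2 type host
        step emit
        step take site2
        step chooseleaf firstn 2 type host
        step emit
}
"""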
Example #6
def run(ceph_cluster, **kw):
    """
    Test to create a large number of omap entries on the single PG pool and test osd resiliency
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    pool_obj = PoolFunctions(node=cephadm)

    pool_target_configs = config["verify_osd_omap_entries"]["configurations"]
    omap_target_configs = config["verify_osd_omap_entries"]["omap_config"]

    # Creating pools and starting the test
    for entry in pool_target_configs.values():
        log.debug(
            f"Creating {entry['pool_type']} pool on the cluster with name {entry['pool_name']}"
        )
        if entry.get("pool_type", "replicated") == "erasure":
            method_should_succeed(rados_obj.create_erasure_pool,
                                  name=entry["pool_name"],
                                  **entry)
        else:
            method_should_succeed(
                rados_obj.create_pool,
                **entry,
            )

        log.debug(
            "Created the pool. beginning to create large number of omap entries on the pool"
        )
        if not pool_obj.fill_omap_entries(pool_name=entry["pool_name"],
                                          **omap_target_configs):
            log.error(
                f"Omap entries not generated on pool {entry['pool_name']}")
            return 1

        # Fetching the current acting set for the pool
        acting_set = rados_obj.get_pg_acting_set(pool_name=entry["pool_name"])
        rados_obj.change_recover_threads(config={}, action="set")
        log.debug(
            f"Proceeding to restart OSD's from the acting set {acting_set}")
        for osd_id in acting_set:
            rados_obj.change_osd_state(action="stop", target=osd_id)
            # sleeping for 5 seconds for re-balancing to begin
            time.sleep(5)

            # Waiting for cluster to get clean state after OSD stopped
            if not wait_for_clean_pg_sets(rados_obj):
                log.error("PG's in cluster are not active + Clean state.. ")
                return 1
            rados_obj.change_osd_state(action="restart", target=osd_id)
            log.debug(
                f"Cluster reached clean state after osd {osd_id} stop and restart"
            )

        rados_obj.change_recover_threads(config={}, action="rm")
        # deleting the pool created after the test
        rados_obj.detete_pool(pool=entry["pool_name"])

        log.info(
            f"All the OSD's from the acting set {acting_set} were restarted "
            f"and object movement completed for pool {entry['pool_name']}")

    log.info(
        "Completed testing effects of large number of omap entries on pools ")
    return 0
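
pool_obj.fill_omap_entries is the only omap-specific call above; a minimal sketch of what it is assumed to do, with the parameter names guessed from a typical omap_config (obj_start, obj_end, and num_keys_obj are assumptions):

def fill_omap_entries_sketch(shell_node, pool_name, obj_start=0, obj_end=50,
                             num_keys_obj=100, **kwargs):
    # Write num_keys_obj omap key/value pairs onto each object in the range
    # via the rados CLI; shell_node is assumed to expose shell() like CephAdmin.
    for obj in range(obj_start, obj_end):
        for key in range(num_keys_obj):
            shell_node.shell([
                f"rados -p {pool_name} setomapval obj_{obj} key_{key} val_{key}"
            ])
    return True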
Example #7
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        log.info(
            "Stopped 'm' OSD's from the acting set, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfilling_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info(
                    "The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active alerts: {status_report['health']['checks'].keys()}, "
                f"PG States : {status_report['num_pg_by_state']}, "
                f"checking status again in 1 minute")
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error(
                "The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(
                f"the pool {ec_config['pool_name']} could not be deleted")
            return 1

        log.info("Successfully tested EC pool recovery with K osd's surviving")
        return 0

    if config.get("Compression_tests"):
        """
        Create a 2 replicated pools:
        1. Pool_1 : enable any compression algorithm(def snappy) and compression mode(aggressive/force).
        2. Pool_2 : set compression mode to none
        Writing the same amount of data on 2 pools, size of pool with compression on would consume less space
        """
        pool_config = config["Compression_tests"]["pool_config"]
        compression_config = config["Compression_tests"]["compression_config"]
        pool_1 = pool_config["pool-1"]
        pool_2 = pool_config["pool-2"]

        if config["Compression_tests"]["pool_type"] == "replicated":
            if not rados_obj.create_pool(pool_name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            if not rados_obj.create_pool(pool_name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
        elif config["Compression_tests"]["pool_type"] == "erasure":
            pool_config["pool_name"] = pool_1
            if not rados_obj.create_erasure_pool(name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            pool_config["pool_name"] = pool_2
            if not rados_obj.create_erasure_pool(name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
            del pool_config["pool_name"]

        log.debug("Created two pools to test compression")

        # Enabling compression on pool-1
        if not rados_obj.pool_inline_compression(pool_name=pool_1,
                                                 **compression_config):
            log.error(
                f"Error setting compression on pool : {pool_1} for config {compression_config}"
            )
            return 1

        # Writing the same amount of data into two pools
        if not rados_obj.bench_write(pool_name=pool_1, **pool_config):
            log.error(
                "Failed to write objects into Pool-1, with compression enabled"
            )
            return 1

        if not rados_obj.bench_write(pool_name=pool_2, **pool_config):
            log.error(
                "Failed to write objects into Pool-2, without compression enabled"
            )
            return 1
        # Sleeping for 5 seconds for status to be updated.
        time.sleep(5)

        log.debug(
            "Finished writing data into the two pools. Checking pool stats")
        try:
            pool_stats = rados_obj.run_ceph_command(
                cmd="ceph df detail")["pools"]
            pool_1_stats = [
                detail for detail in pool_stats if detail["name"] == pool_1
            ][0]["stats"]
            pool_2_stats = [
                detail for detail in pool_stats if detail["name"] == pool_2
            ][0]["stats"]
        except KeyError:
            log.error(
                "No stats about the pools requested found on the cluster")
            return 1

        log.debug(f"Pool-1 stats: {pool_1_stats}")
        log.debug(f"Pool-2 stats: {pool_2_stats}")
        if pool_1_stats["compress_bytes_used"] <= 0:
            log.error("No data stored under pool-1 is compressed")
            return 1

        if pool_1_stats["kb_used"] >= pool_2_stats["kb_used"]:
            log.error("Compression has no effect on the pool size...")
            return 1

        if config["Compression_tests"].get("verify_compression_ratio_set"):
            # added verification for test: CEPH-83571672
            if not rados_obj.check_compression_size(pool_name=pool_1,
                                                    **compression_config):
                log.error("data not compressed in accordance to ratio set")
                return 1

        log.info("Pool size is less when compression is enabled")
        return 0

    if config.get("check_autoscaler_profile"):
        """
        Verifies that the default auto-scaler profile on 5.1 builds is scale-up
        Verifies bugs : 1. https://bugzilla.redhat.com/show_bug.cgi?id=2021738
        """
        build = config.get("build", config.get("rhbuild"))
        autoscale_conf = config.get("check_autoscaler_profile")
        regex = r"5\.1-rhel-\d"
        if re.search(regex, build):
            log.info(
                "Test running on 5.1 builds, checking the default autoscaler profile"
            )
            if not mon_obj.verify_set_config(**autoscale_conf):
                log.error(
                    f"The default value for autoscaler profile is not scale-up in buld {build}"
                )
                return 1
            log.info(f"Autoscale profile is scale-up in release : {build}")
        else:
            log.debug(
                f"The profile is already scale-up by default in release : {build}"
            )
        return 0
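
A config shape for the Compression_tests branch above; pool_type, pool-1, and pool-2 are read directly by the code, while the compression keys are assumptions based on the usual pool compression options:

config = {
    "Compression_tests": {
        "pool_type": "replicated",
        "pool_config": {
            "pool-1": "compression_pool_1",
            "pool-2": "compression_pool_2",
            "pg_num": 32,                        # passed through to create_pool (assumed)
        },
        "compression_config": {
            "compression_algorithm": "snappy",   # assumed option names
            "compression_mode": "aggressive",
            "compression_required_ratio": 0.6,
        },
    }
}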
Example #8
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Actions Performed:
    1. Disables the automatic crush map update
    2. Collects the OSD daemons in the cluster and splits them into 2 sites.
    3. If add capacity is selected, only half of the OSD's will be added to various sites initially.
    4. Adds the stretch rule into crush map.
    5. Adding monitors into the 2 sites.
    6. Create a replicated pool and deploy stretch mode.
    7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites )
    8. Check for the bump in election epochs throughout.
    9. Check the acting set in PG for 4 OSD's. 2 from each site.
    Verifies bugs:
    [1]. https://bugzilla.redhat.com/show_bug.cgi?id=1937088
    [2]. https://bugzilla.redhat.com/show_bug.cgi?id=1952763
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    client_node = ceph_cluster.get_nodes(role="client")[0]
    tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0]

    if not client_node or not tiebreaker_node:
        log.error(
            "Admin client and tie breaker node not configured, Cannot modify crush rules for stretch cluster"
        )
        return 1
    mon_state = get_mon_details(node=cephadm)
    if len(list(mon_state["monitors"])) < 5:
        log.error(
            f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}"
        )
        return 1
    osd_details = get_osd_details(node=cephadm)
    if len(osd_details.keys()) < 4:
        log.error(
            f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}"
        )
        return 1

    # Finding and Deleting any stray EC pools that might have been left on cluster
    pool_dump = rados_obj.run_ceph_command(cmd="ceph osd dump")
    for entry in pool_dump["pools"]:
        if entry["type"] != 1 and entry["crush_rule"] != 0:
            log.info(
                f"A non-replicated pool found : {entry['pool_name']}, proceeding to delete pool"
            )
            if not rados_obj.detete_pool(pool=entry["pool_name"]):
                log.error(
                    f"the pool {entry['pool_name']} could not be deleted")
                return 1
    log.debug("Only replicated pools remain on the cluster")

    # disabling automatic crush update
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    site1 = config.get("site1", "site1")
    site2 = config.get("site2", "site2")

    # Collecting osd details and splitting them into Site A and Site B
    sorted_osds = sort_osd_sites(all_osd_details=osd_details)
    site_a_osds = sorted_osds[0]
    site_b_osds = sorted_osds[1]
    if config.get("perform_add_capacity"):
        site_a_osds = sorted_osds[0][:(len(sorted_osds[0]) // 2)]
        site_b_osds = sorted_osds[1][:(len(sorted_osds[1]) // 2)]

    if not set_osd_sites(
            node=cephadm,
            osds=site_a_osds,
            site=site1,
            all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    if not set_osd_sites(
            node=cephadm,
            osds=site_b_osds,
            site=site2,
            all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    # collecting mon map to be compared after stretch cluster deployment
    stretch_rule_name = "stretch_rule"
    if not setup_crush_rule(node=client_node,
                            rule_name=stretch_rule_name,
                            site1=site1,
                            site2=site2):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Setting the election strategy to connectivity mode
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)
    init_mon_state = get_mon_details(node=cephadm)

    # Checking if mon elections happened after changing election strategy
    if mon_state["epoch"] >= init_mon_state["epoch"]:
        log.error(
            "Election epoch not bumped up after setting the connectivity mode."
        )
        return 1

    # Checking updated election strategy in mon map
    strategy = mon_obj.get_election_strategy()
    if strategy != 3:
        log.error(
            f"cluster created election strategy other than connectivity, i.e {strategy}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    log.info(
        f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3")
    if not set_mon_sites(node=cephadm,
                         tiebreaker_node=tiebreaker_node,
                         site1=site1,
                         site2=site2):
        log.error("Failed to ad monitors into respective sites")
        return 1

    # All the existing pools should be automatically changed with stretch rule. Creating a test pool
    pool_name = "test_pool_1"
    if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
        log.error("Failed to create the replicated Pool")
        return 1

    log.info("Monitors added to respective sites. enabling stretch rule")
    cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1

    if get_mon_details(node=cephadm)["epoch"] < init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after Enabling strech mode")
        return 1

    if config.get("perform_add_capacity"):
        pool_name = "test_stretch_pool"
        if not rados_obj.create_pool(
                pool_name=pool_name,
                crush_rule=stretch_rule_name,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
        do_rados_put(mon=client_node, pool=pool_name, nobj=1000)

        # Increasing backfill/rebalance threads so that cluster will re-balance it faster after add capacity
        rados_obj.change_recover_threads(config=config, action="set")

        log.info(
            "Performing add Capacity after the deployment of stretch cluster")
        site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds]
        site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds]

        if not set_osd_sites(
                node=cephadm,
                osds=site_a_osds,
                site=site1,
                all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1
        if not set_osd_sites(
                node=cephadm,
                osds=site_b_osds,
                site=site2,
                all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1

        # Waiting for up to 2.5 hours for the PG's to enter active + Clean state after add capacity
        # Automation for bug : [1] & [2]
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "remapped",
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "peering",
                    "recovering",
                    "recovery_wait",
                    "undersized",
                    "backfilling_wait",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info(
                    "The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active alerts: {status_report['health']['checks'].keys()}, "
                f"PG States : {status_report['num_pg_by_state']}, "
                f"checking status again in 2 minutes")
            time.sleep(120)
        rados_obj.change_recover_threads(config=config, action="rm")
        if not flag:
            log.error(
                "The cluster did not reach active + Clean state after add capacity"
            )
            return 1

        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)

    # Checking if the pools have been updated with the new crush rules
    acting_set = rados_obj.get_pg_acting_set(pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
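
do_rados_put and do_rados_get wrap the rados CLI on the client node; minimal sketches of the behavior the add-capacity scenario assumes (the object naming scheme and the source file are placeholders, not the library's actual implementation):

def do_rados_put(mon, pool, nobj):
    # Put nobj small objects into the pool from the client/mon node.
    for i in range(nobj):
        mon.exec_command(sudo=True, cmd=f"rados -p {pool} put obj-{i} /etc/hosts")


def do_rados_get(client, pool, read_count):
    # List objects in the pool and read a sample of them back.
    out, _ = client.exec_command(sudo=True, cmd=f"rados -p {pool} ls")
    objects = out.read().decode().split()
    for obj in objects[:read_count]:
        client.exec_command(sudo=True, cmd=f"rados -p {pool} get {obj} /dev/null")
    return f"read {min(read_count, len(objects))} objects from {pool}"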