Beispiel #1
0
def write_to_pools(config, rados_obj, client_node):
    """
    This function will create pool, write data to pool and return pool information
    Args:
        config: config parameters from suite file
        rados_obj: RadosOrchestrator object
        client_node: client node details
    Returns:  pool information
    """
    pools = config.get("create_pools")
    for each_pool in pools:
        cr_pool = each_pool["create_pool"]
        if cr_pool.get("rados_put", False):
            do_rados_put(mon=client_node, pool=cr_pool["pool_name"], nobj=100)
        else:
            method_should_succeed(rados_obj.bench_write, **cr_pool)
Beispiel #2
0
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Actions Performed:
    1. Disables the automatic crush map update
    2. Collects the OSD daemons in the cluster and split them into 2 sites.
    3. If add capacity is selected, only half of the OSD's will be added to various sites initially.
    4. Adds the stretch rule into crush map.
    5. Adding monitors into the 2 sites.
    6. Create a replicated pool and deploy stretch mode.
    7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites )
    8. Check for the bump in election epochs throughout.
    9. Check the acting set in PG for 4 OSD's. 2 from each site.

    Verifies bugs:
    [1]. https://bugzilla.redhat.com/show_bug.cgi?id=1937088
    [2]. https://bugzilla.redhat.com/show_bug.cgi?id=1952763
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    client_node = ceph_cluster.get_nodes(role="client")[0]
    tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0]

    if not client_node and not tiebreaker_node:
        log.error(
            "Admin client and tie breaker node not configured, Cannot modify crush rules for stretch cluster"
        )
        return 1
    mon_state = get_mon_details(node=cephadm)
    if len(list(mon_state["monitors"])) < 5:
        log.error(
            f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}"
        )
        return 1
    osd_details = get_osd_details(node=cephadm)
    if len(osd_details.keys()) < 4:
        log.error(
            f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}"
        )
        return 1

    # disabling automatic crush update
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    site1 = config.get("site1", "site1")
    site2 = config.get("site2", "site2")

    # Collecting osd details and split them into Sita A and Site B
    sorted_osds = sort_osd_sites(all_osd_details=osd_details)
    site_a_osds = sorted_osds[0]
    site_b_osds = sorted_osds[1]
    if config.get("perform_add_capacity"):
        site_a_osds = sorted_osds[0][: (len(sorted_osds[0]) // 2)]
        site_b_osds = sorted_osds[1][: (len(sorted_osds[1]) // 2)]

    if not set_osd_sites(
        node=cephadm,
        osds=site_a_osds,
        site=site1,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    if not set_osd_sites(
        node=cephadm,
        osds=site_b_osds,
        site=site2,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    # collecting mon map to be compared after strtech cluster deployment
    stretch_rule_name = "stretch_rule"
    if not setup_crush_rule(
        node=client_node, rule_name=stretch_rule_name, site1=site1, site2=site2
    ):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Setting the election strategy to connectivity mode
    cmd = "/bin/ceph mon set election_strategy connectivity"
    cephadm.shell([cmd])

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)
    init_mon_state = get_mon_details(node=cephadm)

    # Checking if mon elections happened after changing election strategy
    if mon_state["epoch"] > init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after setting the connectivity mode.")
        return 1

    # Checking updated election strategy in mon map
    if init_mon_state["election_strategy"] != 3:
        log.error(
            f"Election strategy is not connectivity mode.\n Currently set {mon_state['election_strategy']}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    log.info(f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3")
    if not set_mon_sites(
        node=cephadm, tiebreaker_node=tiebreaker_node, site1=site1, site2=site2
    ):
        log.error("Failed to ad monitors into respective sites")
        return 1

    # All the existing pools should be automatically changed with stretch rule. Creating a test pool
    pool_name = "test_pool_1"
    if not create_pool(
        node=cephadm, disable_pg_autoscale=True, pool_name=pool_name, pg_num=16
    ):
        log.error("Failed to create the replicated Pool")
        return 1

    log.info("Monitors added to respective sites. enabling stretch rule")
    cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1

    if get_mon_details(node=cephadm)["epoch"] < init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after Enabling strech mode")
        return 1

    if config.get("perform_add_capacity"):
        pool_name = "test_stretch_pool"
        if not create_pool(
            node=cephadm,
            disable_pg_autoscale=True,
            pool_name=pool_name,
            crush_rule=stretch_rule_name,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
        do_rados_put(mon=client_node, pool=pool_name, nobj=1000)

        # Increasing backfill/rebalance threads so that cluster will re-balance it faster after add capacity
        change_recover_threads(node=cephadm, config=config, action="set")

        log.info("Performing add Capacity after the deployment of stretch cluster")
        site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds]
        site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds]

        if not set_osd_sites(
            node=cephadm,
            osds=site_a_osds,
            site=site1,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1
        if not set_osd_sites(
            node=cephadm,
            osds=site_b_osds,
            site=site2,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1

        # Waiting for up to 2.5 hours for the PG's to enter active + Clean state after add capacity
        # Automation for bug : [1] & [2]
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        flag = True
        while end_time > datetime.datetime.now():
            status_report = run_ceph_command(node=cephadm, cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "remapped",
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "peering",
                    "recovering",
                    "recovery_wait",
                    "peering",
                    "undersized",
                    "backfilling_wait",
                )
                flag = (
                    False
                    if any(key in rec for key in entry["state"].split("+"))
                    else True
                )

            if flag:
                log.info("The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active aletrs: {status_report['health']['checks'].keys()},"
                f"PG States : {status_report['num_pg_by_state']}"
                f" checking status again in 2 minutes"
            )
            time.sleep(120)
        change_recover_threads(node=cephadm, config=config, action="rm")
        if not flag:
            log.error(
                "The cluster did not reach active + Clean state after add capacity"
            )
            return 1

        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)

    # Checking if the pools have been updated with the new crush rules
    acting_set = get_pg_acting_set(node=cephadm, pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
Beispiel #3
0
def run(ceph_cluster, **kw):
    """
    Automates OSD re-balance test scenarios.
    1. Create replicated and/or erasure pool/pools
    2. Identify the osd to be removed
    3. Fetch the host by daemon_type=osd and osd id
    4. Fetch container id and device path
    5. Mark osd out and wait for pgs to be active+clean
    6. Remove OSD
    7. Zap device and wait for device not present
    8. Add OSD and wait for device present and pgs to be active+clean
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    client_node = ceph_cluster.get_nodes(role="client")[0]

    log.info("Running create pool test case")
    if config.get("create_pools"):
        pools = config.get("create_pools")
        for each_pool in pools:
            cr_pool = each_pool["create_pool"]
            if cr_pool.get("pool_type", "replicated") == "erasure":
                method_should_succeed(rados_obj.create_erasure_pool,
                                      name=cr_pool["pool_name"],
                                      **cr_pool)
            else:
                method_should_succeed(rados_obj.create_pool,
                                      pool_name=cr_pool["pool_name"],
                                      **cr_pool)
            method_should_succeed(rados_obj.bench_write, **cr_pool)
        pool = random.choice(pools)["create_pool"]
    if not pool:
        log.error("Failed to retrieve pool details")
        return 1

    rados_obj.change_recover_threads(config=pool, action="set")
    acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool["pool_name"])
    log.info(f"Acting set {acting_pg_set}")
    if not acting_pg_set:
        log.error("Failed to retrieve acting pg set")
        return 1
    osd_id = acting_pg_set[0]
    host = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id)
    if not host:
        log.error("Failed to fetch host details")
        return 1
    # fetch container id
    out, _ = host.exec_command(sudo=True, cmd="podman ps --format json")
    container_id = [
        item["Names"][0] for item in json.loads(out.read().decode())
        if f"osd.{osd_id}" in item["Command"]
    ][0]
    if not container_id:
        log.error("Failed to retrieve container id")
        return 1
    # fetch device path by osd_id
    vol_out, _ = host.exec_command(
        sudo=True,
        cmd=f"podman exec {container_id} ceph-volume lvm list --format json",
    )
    volume_out = vol_out.read().decode()
    dev_path = [
        v[0]["devices"][0] for k, v in json.loads(volume_out).items()
        if str(k) == str(osd_id)
    ][0]
    if not dev_path:
        log.error("Failed to get device path")
        return 1
    log.debug(
        f"device path  : {dev_path}, osd_id : {osd_id}, host.hostname : {host.hostname}"
    )
    utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=True)
    method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id)
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    utils.osd_remove(ceph_cluster, osd_id)
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    method_should_succeed(utils.zap_device, ceph_cluster, host.hostname,
                          dev_path)
    method_should_succeed(wait_for_device,
                          host,
                          container_id,
                          osd_id,
                          action="remove")
    utils.add_osd(ceph_cluster, host.hostname, dev_path, osd_id)
    method_should_succeed(wait_for_device,
                          host,
                          container_id,
                          osd_id,
                          action="add")
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    do_rados_put(mon=client_node, pool=pool["pool_name"], nobj=1000)
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=False)
    rados_obj.change_recover_threads(config=pool, action="rm")

    if config.get("delete_pools"):
        for name in config["delete_pools"]:
            method_should_succeed(rados_obj.detete_pool, name)
        log.info("deleted all the given pools successfully")

    return 0
def run(ceph_cluster, **kw) -> int:
    """
    Test to copy data from one pool to another
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    pool_obj = PoolFunctions(node=cephadm)
    client_node = ceph_cluster.get_nodes(role="client")[0]

    pool_configs_path = "conf/pacific/rados/test-confs/pool-configurations.yaml"

    with open(pool_configs_path, "r") as fd:
        pool_configs = yaml.safe_load(fd)

    pool_orig = pool_configs[config["pool-1-type"]][config["pool-1-conf"]]
    pool_target = pool_configs[config["pool-2-type"]][config["pool-2-conf"]]
    create_given_pool(rados_obj, pool_orig)
    create_given_pool(rados_obj, pool_target)

    # Writing objects with huge omap entries
    if not pool_obj.fill_omap_entries(pool_name=pool_orig["pool_name"],
                                      obj_end=500):
        log.error(
            f"Omap entries not generated on pool {pool_orig['pool_name']}")
        return 1

    do_rados_put(mon=client_node, pool=pool_orig["pool_name"], nobj=1000)

    snapshots = []
    for _ in range(5):
        snap = pool_obj.create_pool_snap(pool_name=pool_orig["pool_name"])
        if snap:
            snapshots.append(snap)
        else:
            log.error("Could not create snapshot on the pool")
            return 1

    # Using cppool to copy contents b/w the pools
    cmd = f"rados cppool {pool_orig['pool_name']} {pool_target['pool_name']}"
    client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

    # Sleeping for 2 seconds after copy to perform get operations
    time.sleep(2)

    do_rados_get(client_node, pool_target["pool_name"], 1)

    # Checking if the snapshots of pool was also copied
    # Snapshots of pool should not be copied
    for snap_name in snapshots:
        if pool_obj.check_snap_exists(snap_name=snap_name,
                                      pool_name=pool_target["pool_name"]):
            log.error("Snapshot of pool exists")
            return 1

    # deleting the Target pool created after cppool
    rados_obj.detete_pool(pool=pool_target["pool_name"])

    # Creating new target pool to test import/export
    create_given_pool(rados_obj, pool_target)

    # Creating temp file to hold pool info
    client_node.exec_command(cmd="touch /tmp/file", )

    # crating export of data on old pool
    cmd = f"rados export -p {pool_orig['pool_name']} /tmp/file"
    client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

    # Importing the file into the new pool
    cmd = f"rados import -p {pool_target['pool_name']} /tmp/file"
    client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

    # Sleeping for 2 seconds after copy to perform get operations
    time.sleep(2)

    do_rados_get(client_node, pool_target["pool_name"], 1)

    # Checking if the snapshots of pool was also copied
    # Snapshots of pool should not be copied
    for snap_name in snapshots:
        if pool_obj.check_snap_exists(snap_name=snap_name,
                                      pool_name=pool_target["pool_name"]):
            log.error("Snapshot of pool exists")
            return 1

    # deleting the Original & Target pool created after cppool
    rados_obj.detete_pool(pool=pool_target["pool_name"])
    rados_obj.detete_pool(pool=pool_orig["pool_name"])
    return 0
Beispiel #5
0
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Actions Performed:
    1. Disables the automatic crush map update
    2. Collects the OSD daemons in the cluster and split them into 2 sites.
    3. If add capacity is selected, only half of the OSD's will be added to various sites initially.
    4. Adds the stretch rule into crush map.
    5. Adding monitors into the 2 sites.
    6. Create a replicated pool and deploy stretch mode.
    7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites )
    8. Check for the bump in election epochs throughout.
    9. Check the acting set in PG for 4 OSD's. 2 from each site.
    Verifies bugs:
    [1]. https://bugzilla.redhat.com/show_bug.cgi?id=1937088
    [2]. https://bugzilla.redhat.com/show_bug.cgi?id=1952763
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    client_node = ceph_cluster.get_nodes(role="client")[0]
    tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0]

    if not client_node and not tiebreaker_node:
        log.error(
            "Admin client and tie breaker node not configured, Cannot modify crush rules for stretch cluster"
        )
        return 1
    mon_state = get_mon_details(node=cephadm)
    if len(list(mon_state["monitors"])) < 5:
        log.error(
            f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}"
        )
        return 1
    osd_details = get_osd_details(node=cephadm)
    if len(osd_details.keys()) < 4:
        log.error(
            f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}"
        )
        return 1

    if config.get("verify_forced_recovery"):
        log.info("Verifying forced recovery and healthy in stretch environment")

        pool_name = "stretch_pool_recovery"
        if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
            log.error("Failed to create the replicated Pool")
            return 1

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool_name)

        log.info(
            f"Killing 2 OSD's from acting set : {acting_pg_set} to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(2)]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        log.info("Stopped 2 OSD's from acting set, starting to wait for recovery")
        rados_obj.change_recover_threads(config=config, action="set")

        if not rados_obj.bench_write(pool_name=pool_name, **config):
            log.error("Failed to write objects into the Pool")
            return 1

        log.debug("Triggering forced recovery in stretch mode")
        cmd = "ceph osd force_recovery_stretch_mode --yes-i-really-mean-it"
        rados_obj.run_ceph_command(cmd)
        log.info("Triggered the recovery in stretch mode")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # there was data written into pool when the OSD's were down.
        # Verifying if data is recovered and placed into the OSD's after bringing them back
        res = wait_for_clean_pg_sets(rados_obj)
        if not res:
            log.error("PG's in cluster are not active + Clean ")
            return 1

        log.debug("Forcing the stretch cluster into healthy mode")
        cmd = "ceph osd force_healthy_stretch_mode --yes-i-really-mean-it"
        rados_obj.run_ceph_command(cmd)

        log.info("Cluster has successfully recovered and is in healthy state")
        return 0

    # Finding and Deleting any stray EC pools that might have been left on cluster
    pool_dump = rados_obj.run_ceph_command(cmd="ceph osd dump")
    for entry in pool_dump["pools"]:
        if entry["type"] != 1 and entry["crush_rule"] != 0:
            log.info(
                f"A non-replicated pool found : {entry['pool_name']}, proceeding to delete pool"
            )
            if not rados_obj.detete_pool(pool=entry["pool_name"]):
                log.error(f"the pool {entry['pool_name']} could not be deleted")
                return 1
        log.debug("No pools other than replicated found on cluster")

    # disabling automatic crush update
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    site1 = config.get("site1", "site1")
    site2 = config.get("site2", "site2")

    # Collecting osd details and split them into Sita A and Site B
    sorted_osds = sort_osd_sites(all_osd_details=osd_details)
    site_a_osds = sorted_osds[0]
    site_b_osds = sorted_osds[1]
    if config.get("perform_add_capacity"):
        site_a_osds = sorted_osds[0][: (len(sorted_osds[0]) // 2)]
        site_b_osds = sorted_osds[1][: (len(sorted_osds[1]) // 2)]

    if not set_osd_sites(
        node=cephadm,
        osds=site_a_osds,
        site=site1,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    if not set_osd_sites(
        node=cephadm,
        osds=site_b_osds,
        site=site2,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    # collecting mon map to be compared after strtech cluster deployment
    stretch_rule_name = "stretch_rule"
    if not setup_crush_rule(
        node=client_node, rule_name=stretch_rule_name, site1=site1, site2=site2
    ):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Setting the election strategy to connectivity mode
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)
    init_mon_state = get_mon_details(node=cephadm)

    # Checking if mon elections happened after changing election strategy
    if mon_state["epoch"] > init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after setting the connectivity mode.")
        return 1

    # Checking updated election strategy in mon map
    strategy = mon_obj.get_election_strategy()
    if strategy != 3:
        log.error(
            f"cluster created election strategy other than connectivity, i.e {strategy}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    log.info(f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3")
    if not set_mon_sites(
        node=cephadm, tiebreaker_node=tiebreaker_node, site1=site1, site2=site2
    ):
        log.error("Failed to ad monitors into respective sites")
        return 1

    # All the existing pools should be automatically changed with stretch rule. Creating a test pool
    pool_name = "test_pool_1"
    if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
        log.error("Failed to create the replicated Pool")
        return 1

    log.info("Monitors added to respective sites. enabling stretch rule")
    cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1

    if get_mon_details(node=cephadm)["epoch"] < init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after Enabling strech mode")
        return 1

    # Increasing backfill/rebalance threads so that cluster will re-balance it faster
    rados_obj.change_recover_threads(config=config, action="set")

    # wait for active + clean after deployment of stretch mode
    # checking the state after deployment coz of BZ : https://bugzilla.redhat.com/show_bug.cgi?id=2025800
    res = wait_for_clean_pg_sets(rados_obj)
    if not res:
        status_report = rados_obj.run_ceph_command(cmd="ceph report")
        # Proceeding to check if all PG's are in active + clean
        for entry in status_report["num_pg_by_state"]:
            rec = ("remapped", "peering")
            if any(key in rec for key in entry["state"].split("+")):
                log.error(
                    "PG's in cluster are stuck in remapped+peering after stretch deployment."
                )
                return 1

    if config.get("perform_add_capacity"):
        pool_name = "test_stretch_pool"
        if not rados_obj.create_pool(
            pool_name=pool_name,
            crush_rule=stretch_rule_name,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
        do_rados_put(mon=client_node, pool=pool_name, nobj=100)

        log.info("Performing add Capacity after the deployment of stretch cluster")
        site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds]
        site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds]

        if not set_osd_sites(
            node=cephadm,
            osds=site_a_osds,
            site=site1,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1
        if not set_osd_sites(
            node=cephadm,
            osds=site_b_osds,
            site=site2,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1

        flag = wait_for_clean_pg_sets(rados_obj)
        if not flag:
            log.error(
                "The cluster did not reach active + Clean state after add capacity"
            )
            return 1

        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)
        log.info("Successfully completed Add Capacity scenario")

    rados_obj.change_recover_threads(config=config, action="rm")

    # Checking if the pools have been updated with the new crush rules
    acting_set = rados_obj.get_pg_acting_set(pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
Beispiel #6
0
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Actions Performed:
    1. Disables the automatic crush map update
    2. Collects the OSD daemons in the cluster and split them into 2 sites.
    3. If add capacity is selected, only half of the OSD's will be added to various sites initially.
    4. Adds the stretch rule into crush map.
    5. Adding monitors into the 2 sites.
    6. Create a replicated pool and deploy stretch mode.
    7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites )
    8. Check for the bump in election epochs throughout.
    9. Check the acting set in PG for 4 OSD's. 2 from each site.
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    client_node = ceph_cluster.get_nodes(role="client")[0]
    tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0]

    if not client_node and not tiebreaker_node:
        log.error(
            "Admin client and tie breaker node not configured, Cannot modify crush rules for stretch cluster"
        )
        return 1
    mon_state = get_mon_details(node=cephadm)
    if len(list(mon_state["monitors"])) < 5:
        log.error(
            f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}"
        )
        return 1
    osd_details = get_osd_details(node=cephadm)
    if len(osd_details.keys()) < 4:
        log.error(
            f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}"
        )
        return 1

    # disabling automatic crush update
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    # Collecting osd details and split them into Sita A and Site B
    sorted_osds = sort_osd_sites(all_osd_details=osd_details)
    site_a_osds = sorted_osds[0]
    site_b_osds = sorted_osds[1]
    if config.get("perform_add_capacity"):
        site_a_osds = sorted_osds[0][:(len(sorted_osds[0]) // 2)]
        site_b_osds = sorted_osds[1][:(len(sorted_osds[1]) // 2)]

    if not set_osd_sites(
            node=cephadm,
            osds=site_a_osds,
            site=1,
            all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    if not set_osd_sites(
            node=cephadm,
            osds=site_b_osds,
            site=2,
            all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    # collecting mon map to be compared after strtech cluster deployment
    stretch_rule_name = "stretch_rule"
    if not setup_crush_rule(node=client_node, rule_name=stretch_rule_name):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Setting the election strategy to connectivity mode
    cmd = "/bin/ceph mon set election_strategy connectivity"
    cephadm.shell([cmd])

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)
    init_mon_state = get_mon_details(node=cephadm)

    # Checking if mon elections happened after changing election strategy
    if mon_state["epoch"] > init_mon_state["epoch"]:
        log.error(
            "Election epoch not bumped up after setting the connectivity mode."
        )
        return 1

    # Checking updated election strategy in mon map
    if init_mon_state["election_strategy"] != 3:
        log.error(
            f"Election strategy is not connectivity mode.\n Currently set {mon_state['election_strategy']}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    log.info(
        f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3")
    if not set_mon_sites(node=cephadm, tiebreaker_node=tiebreaker_node):
        log.error("Failed to ad monitors into respective sites")
        return 1

    # All the existing pools should be automatically changed with stretch rule. Creating a test pool
    pool_name = "test_pool_1"
    if not create_pool(node=cephadm,
                       disable_pg_autoscale=True,
                       pool_name=pool_name,
                       pg_num=16):
        log.error("Failed to create the replicated Pool")
        return 1

    log.info("Monitors added to respective sites. enabling stretch rule")
    cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1

    if get_mon_details(node=cephadm)["epoch"] < init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after Enabling strech mode")
        return 1

    if config.get("perform_add_capacity"):
        pool_name = "test_stretch_pool"
        if not create_pool(
                node=cephadm,
                disable_pg_autoscale=True,
                pool_name=pool_name,
                crush_rule=stretch_rule_name,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
        do_rados_put(mon=client_node, pool=pool_name, nobj=100)

        log.info(
            "Performing add Capacity after the deployment of stretch cluster")
        site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds]
        site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds]

        if not set_osd_sites(
                node=cephadm,
                osds=site_a_osds,
                site=1,
                all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1
        if not set_osd_sites(
                node=cephadm,
                osds=site_b_osds,
                site=2,
                all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1

        # Sleeping for 10 seconds after adding OSD's for the PG re-balancing to start and begin rados get
        time.sleep(10)
        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)

    # Checking if the pools have been updated with the new crush rules
    acting_set = get_pg_acting_set(node=cephadm, pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0