Example #1
def run(ceph_cluster, **kw):
    """Ceph-admin module to manage ceph-dashboard service.

    check ceph.ceph_admin.dashboard for test config.

    Args:
        ceph_cluster (ceph.ceph.Ceph): Ceph cluster object.
        kw: keyword arguments from test data.

    Returns:
        value 0 on success.

    """
    log.info("Running Ceph-admin Dashboard test")
    config = kw.get("config")

    build = config.get("build", config.get("rhbuild"))
    ceph_cluster.rhcs_version = build

    # Manage Ceph using ceph-admin orchestration
    command = config.pop("command")
    log.info("Executing dashboard %s operation" % command)
    instance = CephAdmin(cluster=ceph_cluster, **config)

    try:
        method = fetch_method(dashboard, command)
        method(instance, config.get("args"))
    finally:
        # Get cluster state
        get_cluster_state(instance)
    return 0
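A minimal sketch of how a harness could invoke this module; the config keys mirror the code above (a "command" selecting a function in the dashboard module, "args" forwarded to it). The operation and argument values are hypothetical placeholders, not part of ceph.ceph_admin.dashboard.

# Hypothetical invocation sketch (placeholder values); the framework normally
# supplies ceph_cluster and the parsed test config.
sample_kw = {
    "config": {
        "command": "enable",        # placeholder dashboard operation
        "args": {"port": 8443},     # placeholder arguments for that operation
    }
}
# rc = run(ceph_cluster, **sample_kw)
# assert rc == 0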
Example #2
def run(ceph_cluster, **kw):
    """
    Prepares the cluster & runs rados Customer Scenarios.
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
        kw: Args that need to be passed to the test for initialization
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    if config.get("mondb_trim_config"):
        db_config = config.get("mondb_trim_config")
        try:
            verify_mon_db_trim(ceph_cluster=ceph_cluster,
                               node=cephadm,
                               **db_config)
            log.info("Mon DB is getting trimmed regularly")
        except (TestCaseFailureException, TestBedSetupFailure):
            log.error("Failed to verify mon db trimming")
            return 1

    log.info("Completed running the customer Scenario(s)")
    return 0
Example #3
def run(ceph_cluster, **kw):
    """
    Cephadm Bootstrap

    Args:
        ceph_cluster (ceph.ceph.Ceph): Ceph cluster object
        kw: test data

    - Bootstraps the cluster with the default or a custom image and
      returns after cephadm.bootstrap. To use the default image, set 'registry'.

        Example:
            config:
                command: bootstrap
                base_cmd_args:
                    verbose: true
                args:
                    custom_image: true | false
                    mon-ip: <node_name>
                    mgr-id: <mgr_id>
                    fsid: <id>
    """
    config = kw.get("config")
    build = config.get("build", config.get("rhbuild"))
    ceph_cluster.rhcs_version = build
    config["overrides"] = kw.get("test_data", {}).get("custom-config")

    # Manage Ceph using ceph-admin orchestration
    command = config.pop("command")
    service = config.pop("service", "")
    log.info("Executing %s %s" % (service, command))

    instance = CephAdmin(cluster=ceph_cluster, **config)
    if "shell" in command:
        instance.shell(args=config["args"])
        return 0
    try:
        method = fetch_method(instance, command)
        out, err = method(config)

        # Verification of arguments
        # bootstrap response through stdout & stderr are combined here
        # currently console response coming through stderr.
        args = config.get("args", {})
        verify_bootstrap(instance, args, out + err)
    finally:
        # Get cluster state
        get_cluster_state(instance)
    return 0
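The docstring above already shows the YAML form of the bootstrap config; the same data as a Python dict, for clarity (node name, mgr id and fsid stay as the docstring's placeholders):

# Python equivalent of the docstring's YAML example; values remain placeholders.
bootstrap_kw = {
    "config": {
        "command": "bootstrap",
        "base_cmd_args": {"verbose": True},
        "args": {
            "custom_image": False,
            "mon-ip": "<node_name>",
            "mgr-id": "<mgr_id>",
            "fsid": "<id>",
        },
    }
}
# rc = run(ceph_cluster, **bootstrap_kw)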
Example #4
def run(ceph_cluster, **kw):
    """
    Verifies the config change history in monitor configuration database changes
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    # getting the last config change, to which we will roll back later
    init_config = mon_obj.get_ceph_log(count=1)[0]
    log.info("Config at the beginning of test. \n"
             f"Version: {init_config['version']}"
             f"Changes made: {init_config['changes']}")

    log.info(
        "Setting new changes and verifying if the changes are reflected in the log"
    )
    if not mon_obj.set_config(section="osd", name="osd_max_scrubs", value="8"):
        log.error("Error setting config ")
        return 1

    # Checking the versions and changes made.
    test_config = mon_obj.get_ceph_log(count=1)[0]
    log.info("Config changes made for test. \n"
             f"Version: {test_config['version']}"
             f"Changes made: {test_config['changes']}")

    if not test_config["version"] > init_config["version"]:
        log.error(f"The log is not updated with new config changes."
                  f"Version: {test_config['version']}")
        return 1
    try:
        name = test_config["changes"][0].get("name")
        value = str(test_config["changes"][0].get("new_value"))
        if not name == "osd/osd_max_scrubs" and value == "8":
            log.error(f"The log is not updated with new config changes."
                      f"Changes made: {test_config['changes']}")
            return 1
    except Exception:
        log.error(
            "The log collected does not contain the value and changes made")
        return 1

    log.info("The ceph config log is successfully updated after changes ")
    return 0
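The check above restated on plain dicts, as a sketch of what the test expects from the config log; the entries here are made up, but their shape matches what the code reads (a "version" and a "changes" list carrying "name"/"new_value").

# Sketch of the assertion on plain data: a new "ceph config set" should append
# a log entry with a higher version, and its first change should record the
# option name and the new value.
init_entry = {"version": 25, "changes": [{"name": "mon/mon_max_log_epochs", "new_value": "500"}]}
test_entry = {"version": 26, "changes": [{"name": "osd/osd_max_scrubs", "new_value": "8"}]}

assert test_entry["version"] > init_entry["version"]
change = test_entry["changes"][0]
assert change["name"] == "osd/osd_max_scrubs" and str(change["new_value"]) == "8"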
Example #5
def run(ceph_cluster, **kw):
    """
    Verifies the config change reverts in monitor configuration database changes taken from logs
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    init_config = mon_obj.get_ceph_log(count=1)[0]
    if not mon_obj.set_config(
            section="mon", name="mon_max_log_epochs", value="1000"):
        log.error("Error setting config ")
        return 1
    log.info(
        f"Proceeding with reverting the last config change, selecting version: {init_config['version']}"
    )
    if not mon_obj.ceph_config_reset(version=init_config["version"]):
        log.error(
            f"Could not revert to the selected version : {init_config['version']}"
        )
        return 1

    log.info(
        "Reverted to selected version. Checking if the config value is removed"
    )
    if mon_obj.verify_set_config(section="mon",
                                 name="mon_max_log_epochs",
                                 value="1000"):
        log.error("Config is still set after the reset")
        return 1

    test_config = mon_obj.get_ceph_log(count=1)[0]
    log.info(
        f"reverted successfully to previous versions. config log : {test_config}"
    )

    log.info("The ceph config log is successfully updated after changes ")
    return 0
Example #6
def operator(test_config, step_config, **kw):
    """
    Using the provided test config file, this method triggers the RBD SDK
    calls for that specific scenario

    Arguments:
        test_config: containing the key/value pairs passed from the test-suite
        step_config: arguments required for a specific operation
        kw: test data

    Returns:
        0 on success or 1 for failures
    """
    if step_config.get("method") == "shell":
        cephadm = CephAdmin(kw["ceph_cluster_dict"], test_config)
        cephadm.shell(args=step_config["args"])
    else:
        # maintain dictionary to map to classes based on service
        # instantiate class
        instance = CLASS_MAP[step_config["class"]](nodes=kw["ceph_nodes"])
        method = getattr(instance, step_config["method"])
        log.info(method)
        method(step_config["args"])
    return 0
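A self-contained sketch of the dispatch pattern the operator relies on: a dictionary maps a service name to a class, the class is instantiated with the nodes, and the requested method is looked up with getattr. DemoRbd and DEMO_CLASS_MAP are illustrative stand-ins, not the real CLASS_MAP entries.

class DemoRbd:
    """Placeholder class standing in for a real service wrapper."""

    def __init__(self, nodes):
        self.nodes = nodes

    def create_image(self, args):
        return f"created {args['name']} of size {args['size']} on {self.nodes}"


DEMO_CLASS_MAP = {"rbd": DemoRbd}

step_config = {"class": "rbd", "method": "create_image",
               "args": {"name": "img1", "size": "1G"}}
instance = DEMO_CLASS_MAP[step_config["class"]](nodes=["node1"])
method = getattr(instance, step_config["method"])
print(method(step_config["args"]))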
Example #7
def run(ceph_cluster, **kw):
    """
    Runs cephadm deployment
    Args:
        ceph_cluster (ceph.ceph.Ceph): Ceph cluster object
    """
    log.info("Running test")
    log.info("Running cephadm test")
    config = kw.get('config')
    test_data = kw.get('test_data')
    # base_url = config.get('base_url', None)
    ceph_cluster.custom_config = test_data.get('custom-config')
    ceph_cluster.custom_config_file = test_data.get('custom-config-file')

    build = config.get('build', config.get('rhbuild'))
    name = config.get('cluster_name', 'ceph')
    # deploy = config.get('deploy', False)
    exec_shell = config.get('exec_shell', False)
    ceph_cluster.rhcs_version = build

    if config.get('skip_setup') is True:
        log.info("Skipping setup of ceph cluster")
        return 0

    # get installer node
    ceph_installer = ceph_cluster.get_ceph_object('installer')
    cephadm = CephAdmin(name=name,
                        ceph_cluster=ceph_cluster,
                        ceph_installer=ceph_installer,
                        **config)

    # Deployment-only
    # if config.get('deployment'):
    #     cephadm.deploy()
    #     return 0

    if exec_shell:
        for cmd in exec_shell:
            cmd = cmd if isinstance(cmd, list) else [cmd]
            cephadm.shell(
                remote=ceph_installer,
                args=cmd,
            )
        return 0

    # copy ssh keys to other hosts
    ceph_cluster.setup_ssh_keys()

    # set tool download repository
    cephadm.set_tool_repo()

    # install/download cephadm package on installer
    cephadm.install_cephadm()

    # bootstrap cluster
    cephadm.bootstrap()

    # add all hosts
    cephadm.manage_hosts()

    # add all daemons
    cephadm.add_daemons()

    return 0
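The exec_shell branch above accepts both plain strings and lists; a small sketch of that normalization with placeholder commands:

# Each exec_shell entry may be a string or a list; strings are wrapped into a
# single-element list before being handed to cephadm.shell().
exec_shell = ["ceph -s", ["ceph", "orch", "ls"]]   # placeholder commands
for cmd in exec_shell:
    cmd = cmd if isinstance(cmd, list) else [cmd]
    print(cmd)   # ['ceph -s'] then ['ceph', 'orch', 'ls']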
Example #8
def run(ceph_cluster, **kw):
    """
    Polarion ID: https://polarion.engineering.redhat.com/polarion/#/project/CEPH/workitem?id=CEPH-83573854
    Feature : Mute Health alerts : https://trello.com/c/AU8FT6Qp/27-mute-health-warnings
    RFE Bug : https://bugzilla.redhat.com/show_bug.cgi?id=1821508
    upstream doc : https://docs.ceph.com/en/octopus/rados/operations/monitoring/#muting-health-checks

    1. Check the cluster health status and simulate various failures.
    2. Verify the health warning/ error generated due to the failures.
    3. Mute the failures, test the various arguments and verify that the alerts are muted successfully
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running test for feature : Mute Health Alerts")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    all_alerts = get_alerts(cephadm)
    alert_list = ["MON_DISK_BIG", "OSDMAP_FLAGS"]
    if all_alerts["active_alerts"]:
        log.info(
            f"There are health alerts generated on the cluster. Alerts : {all_alerts}"
        )
    else:
        log.info(
            "Cluster Health is ok. \n Generating a alert to verify feature")

    # Scenario 1 : Verify the auto-unmute of alert after TTL
    log.info("Scenario 1: Verify the auto-unmute of alert after TTL")
    alert = alert_list[1]
    if not verify_alert_with_ttl(
            node=cephadm, alert=alert, ttl=8, flag="noscrub"):
        log.error(f"Scenario 1 for alert : {alert} has failed")
        return 1

    # Scenario 2 : Verify the auto-unmute of alert if the health alert is generated again. ( without sticky )
    log.info(
        "Scenario 2: Verify the auto-unmute of alert if the health alert is generated again."
    )
    alert = alert_list[1]
    if not verify_alert_unmute(node=cephadm, alert=alert):
        log.error(f"Scenario 2 for alert : {alert} has failed")
        return 1

    # Scenario 3 : Verify the auto-unmute of alert if the health alert is generated again. ( with sticky )
    log.info(
        "Scenario 3: Verify the auto-unmute of alert if the health alert is generated again."
    )
    alert = alert_list[1]
    if not verify_alert_unmute(node=cephadm, alert=alert, sticky=True):
        log.error(f"Scenario 3 for alert : {alert} has failed")
        return 1

    log.info(f"All the current alerts : {get_alerts(cephadm)}")
    # Scenario 4 : Verify the unmute command with sticky
    log.info("Scenario 4 : Verify the unmute command")
    alert = alert_list[0]
    if not verify_unmute_cli(node=cephadm, alert=alert, sticky=True):
        log.error(f"Scenario 4 for alert : {alert} has failed")
        return 1

    log.info("All the scenarios have passed")
    return 0
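The verify_* helpers used above exercise the health-mute CLI described in the upstream monitoring doc linked in the docstring; a sketch of the underlying commands, where the alert code and TTL are placeholders and the strings would be run through cephadm.shell like the other commands in these examples:

# Sketch of the health mute CLI (see the linked upstream monitoring doc).
alert, ttl = "OSDMAP_FLAGS", "8m"                       # placeholder alert code and TTL
mute_cmd = f"ceph health mute {alert} {ttl}"            # auto-unmutes once the TTL expires
sticky_mute_cmd = f"ceph health mute {alert} --sticky"  # stays muted even if the alert re-fires
unmute_cmd = f"ceph health unmute {alert}"
# cephadm.shell([mute_cmd])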
Example #9
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Actions Performed:
    1. Disables the automatic crush map update
    2. Collects the OSD daemons in the cluster and split them into 2 sites.
    3. If add capacity is selected, only half of the OSD's will be added to various sites initially.
    4. Adds the stretch rule into crush map.
    5. Adding monitors into the 2 sites.
    6. Create a replicated pool and deploy stretch mode.
    7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites )
    8. Check for the bump in election epochs throughout.
    9. Check the acting set in PG for 4 OSD's. 2 from each site.
    Verifies bugs:
    [1]. https://bugzilla.redhat.com/show_bug.cgi?id=1937088
    [2]. https://bugzilla.redhat.com/show_bug.cgi?id=1952763
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    client_node = ceph_cluster.get_nodes(role="client")[0]
    tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0]

    if not client_node or not tiebreaker_node:
        log.error(
            "Admin client or tie breaker node not configured, Cannot modify crush rules for stretch cluster"
        )
        return 1
    mon_state = get_mon_details(node=cephadm)
    if len(list(mon_state["monitors"])) < 5:
        log.error(
            f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}"
        )
        return 1
    osd_details = get_osd_details(node=cephadm)
    if len(osd_details.keys()) < 4:
        log.error(
            f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}"
        )
        return 1

    if config.get("verify_forced_recovery"):
        log.info("Verifying forced recovery and healthy in stretch environment")

        pool_name = "stretch_pool_recovery"
        if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
            log.error("Failed to create the replicated Pool")
            return 1

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool_name)

        log.info(
            f"Killing 2 OSD's from acting set : {acting_pg_set} to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(2)]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        log.info("Stopped 2 OSD's from acting set, starting to wait for recovery")
        rados_obj.change_recover_threads(config=config, action="set")

        if not rados_obj.bench_write(pool_name=pool_name, **config):
            log.error("Failed to write objects into the Pool")
            return 1

        log.debug("Triggering forced recovery in stretch mode")
        cmd = "ceph osd force_recovery_stretch_mode --yes-i-really-mean-it"
        rados_obj.run_ceph_command(cmd)
        log.info("Triggered the recovery in stretch mode")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # there was data written into pool when the OSD's were down.
        # Verifying if data is recovered and placed into the OSD's after bringing them back
        res = wait_for_clean_pg_sets(rados_obj)
        if not res:
            log.error("PG's in cluster are not active + Clean ")
            return 1

        log.debug("Forcing the stretch cluster into healthy mode")
        cmd = "ceph osd force_healthy_stretch_mode --yes-i-really-mean-it"
        rados_obj.run_ceph_command(cmd)

        log.info("Cluster has successfully recovered and is in healthy state")
        return 0

    # Finding and Deleting any stray EC pools that might have been left on cluster
    pool_dump = rados_obj.run_ceph_command(cmd="ceph osd dump")
    for entry in pool_dump["pools"]:
        if entry["type"] != 1 and entry["crush_rule"] != 0:
            log.info(
                f"A non-replicated pool found : {entry['pool_name']}, proceeding to delete pool"
            )
            if not rados_obj.detete_pool(pool=entry["pool_name"]):
                log.error(f"the pool {entry['pool_name']} could not be deleted")
                return 1
        log.debug("No pools other than replicated found on cluster")

    # disabling automatic crush update
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    site1 = config.get("site1", "site1")
    site2 = config.get("site2", "site2")

    # Collecting osd details and splitting them into Site A and Site B
    sorted_osds = sort_osd_sites(all_osd_details=osd_details)
    site_a_osds = sorted_osds[0]
    site_b_osds = sorted_osds[1]
    if config.get("perform_add_capacity"):
        site_a_osds = sorted_osds[0][: (len(sorted_osds[0]) // 2)]
        site_b_osds = sorted_osds[1][: (len(sorted_osds[1]) // 2)]

    if not set_osd_sites(
        node=cephadm,
        osds=site_a_osds,
        site=site1,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    if not set_osd_sites(
        node=cephadm,
        osds=site_b_osds,
        site=site2,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    # Creating the crush rule to be used for the stretch cluster deployment
    stretch_rule_name = "stretch_rule"
    if not setup_crush_rule(
        node=client_node, rule_name=stretch_rule_name, site1=site1, site2=site2
    ):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Setting the election strategy to connectivity mode
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)
    init_mon_state = get_mon_details(node=cephadm)

    # Checking if mon elections happened after changing election strategy
    if init_mon_state["epoch"] <= mon_state["epoch"]:
        log.error("Election epoch not bumped up after setting the connectivity mode.")
        return 1

    # Checking updated election strategy in mon map
    strategy = mon_obj.get_election_strategy()
    if strategy != 3:
        log.error(
            f"cluster created election strategy other than connectivity, i.e {strategy}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    log.info(f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3")
    if not set_mon_sites(
        node=cephadm, tiebreaker_node=tiebreaker_node, site1=site1, site2=site2
    ):
        log.error("Failed to ad monitors into respective sites")
        return 1

    # All the existing pools should be automatically changed with stretch rule. Creating a test pool
    pool_name = "test_pool_1"
    if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
        log.error("Failed to create the replicated Pool")
        return 1

    log.info("Monitors added to respective sites. enabling stretch rule")
    cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1

    if get_mon_details(node=cephadm)["epoch"] < init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after Enabling strech mode")
        return 1

    # Increasing backfill/rebalance threads so that cluster will re-balance it faster
    rados_obj.change_recover_threads(config=config, action="set")

    # wait for active + clean after deployment of stretch mode
    # checking the state after deployment coz of BZ : https://bugzilla.redhat.com/show_bug.cgi?id=2025800
    res = wait_for_clean_pg_sets(rados_obj)
    if not res:
        status_report = rados_obj.run_ceph_command(cmd="ceph report")
        # Proceeding to check if all PG's are in active + clean
        for entry in status_report["num_pg_by_state"]:
            rec = ("remapped", "peering")
            if any(key in rec for key in entry["state"].split("+")):
                log.error(
                    "PG's in cluster are stuck in remapped+peering after stretch deployment."
                )
                return 1

    if config.get("perform_add_capacity"):
        pool_name = "test_stretch_pool"
        if not rados_obj.create_pool(
            pool_name=pool_name,
            crush_rule=stretch_rule_name,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
        do_rados_put(mon=client_node, pool=pool_name, nobj=100)

        log.info("Performing add Capacity after the deployment of stretch cluster")
        site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds]
        site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds]

        if not set_osd_sites(
            node=cephadm,
            osds=site_a_osds,
            site=site1,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1
        if not set_osd_sites(
            node=cephadm,
            osds=site_b_osds,
            site=site2,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1

        flag = wait_for_clean_pg_sets(rados_obj)
        if not flag:
            log.error(
                "The cluster did not reach active + Clean state after add capacity"
            )
            return 1

        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)
        log.info("Successfully completed Add Capacity scenario")

    rados_obj.change_recover_threads(config=config, action="rm")

    # Checking if the pools have been updated with the new crush rules
    acting_set = rados_obj.get_pg_acting_set(pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
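A sketch of the add-capacity split used above, on plain lists: when perform_add_capacity is set, only the first half of each site's OSDs is placed before stretch mode is enabled, and the remainder is moved in afterwards. The OSD ids below are placeholders.

# Sketch of the add-capacity split on placeholder OSD id lists.
site_a_all = [0, 1, 2, 3]
site_b_all = [4, 5, 6, 7]

initial_a = site_a_all[: len(site_a_all) // 2]   # placed before enabling stretch mode -> [0, 1]
initial_b = site_b_all[: len(site_b_all) // 2]   # -> [4, 5]

later_a = [osd for osd in site_a_all if osd not in initial_a]   # added as extra capacity -> [2, 3]
later_b = [osd for osd in site_b_all if osd not in initial_b]   # -> [6, 7]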
Example #10
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        log.info("Stopped 'm' number of OSD's from, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfilling_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info("The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active alerts: {status_report['health']['checks'].keys()}, "
                f"PG States : {status_report['num_pg_by_state']}, "
                f"checking status again in 1 minute"
            )
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error("The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(f"the pool {ec_config['pool_name']} could not be deleted")
            return 1

        log.info("Successfully tested EC pool recovery with K osd's surviving")
        return 0
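The recovery wait used above, factored into a standalone sketch: it polls a "ceph report"-shaped dict until no PG state contains a recovery-type keyword or the timeout expires. wait_for_recovery and get_report are illustrative names, not part of the test library.

import datetime
import time


def wait_for_recovery(get_report, timeout=9000, interval=60):
    """Sketch of the wait loop above; get_report is any callable returning a
    dict shaped like the output of 'ceph report'."""
    rec = ("backfilling", "degraded", "incomplete", "recovering",
           "recovery_wait", "backfilling_wait", "peered", "undersized")
    end_time = datetime.datetime.now() + datetime.timedelta(seconds=timeout)
    while end_time > datetime.datetime.now():
        report = get_report()
        states = [entry["state"] for entry in report["num_pg_by_state"]]
        if not any(key in rec for state in states for key in state.split("+")):
            return True
        time.sleep(interval)
    return False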
Example #11
def run(ceph_cluster, **kw):
    """
    Prepares the cluster to run rados tests.
    Actions Performed:
    1. Create a Replicated and Erasure coded pools and write Objects into pools
    2. Setup email alerts for sending errors/warnings on the cluster.
        Verifies Bugs:
        https://bugzilla.redhat.com/show_bug.cgi?id=1849894
        https://bugzilla.redhat.com/show_bug.cgi?id=1878145
    3. Enable logging into file and check file permissions
        Verifies Bug : https://bugzilla.redhat.com/show_bug.cgi?id=1884469
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
        kw: Args that need to be passed to the test for initialization

    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)
    out, err = cephadm.shell(["uuidgen"])
    uuid = out.split("-")[0]

    if config.get("ec_pool"):
        ec_config = config.get("ec_pool")
        ec_config.setdefault("pool_name", f"ecpool_{uuid}")
        if not rados_obj.create_erasure_pool(name=uuid, **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if ec_config.get("test_overwrites_pool"):
            if not rados_obj.verify_ec_overwrites(**ec_config):
                log.error("Failed to create the EC Pool")
                return 1
        else:
            if not rados_obj.bench_write(**ec_config):
                log.error("Failed to write objects into the EC Pool")
                return 1
            rados_obj.bench_read(**ec_config)
            log.info(
                "Created the EC Pool, Finished writing data into the pool")

        if ec_config.get("delete_pool"):
            if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
                log.error("Failed to delete EC Pool")
                return 1

    if config.get("replicated_pool"):
        rep_config = config.get("replicated_pool")
        rep_config.setdefault("pool_name", f"repool_{uuid}")
        if not rados_obj.create_pool(**rep_config):
            log.error("Failed to create the replicated Pool")
            return 1
        if not rados_obj.bench_write(**rep_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**rep_config)
        log.info(
            "Created the replicated Pool, Finished writing data into the pool")
        if rep_config.get("delete_pool"):
            if not rados_obj.detete_pool(pool=rep_config["pool_name"]):
                log.error("Failed to delete replicated Pool")
                return 1

    if config.get("set_pool_configs"):
        changes = config["set_pool_configs"]
        pool_name = changes["pool_name"]
        configurations = changes["configurations"]
        for conf in configurations.keys():
            if not rados_obj.set_pool_property(
                    pool=pool_name, props=conf, value=configurations[conf]):
                log.error(f"failed to set property {conf} on the cluster")
                return 1
        log.info(f"made the config changes on the pool {pool_name}")

    if config.get("email_alerts"):
        alert_config = config.get("email_alerts")
        if not rados_obj.enable_email_alerts(**alert_config):
            log.error("Error while configuring email alerts")
            return 1
        log.info("email alerts configured")

    if config.get("Verify_config_parameters"):
        test_config = config.get("Verify_config_parameters")
        test_node = ceph_cluster.get_nodes(role="osd")[0]
        for conf in test_config["configurations"]:
            for entry in conf.values():
                if entry.get("location_type") == "host":
                    entry["location_value"] = test_node.hostname
                if not mon_obj.set_config(**entry):
                    log.error(f"Error setting config {conf}")
                    return 1
        log.info("done")
        pool_name = "test_pool_1"
        if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
            log.error("Failed to create the replicated Pool")
            return 1

        rados_obj.bench_write(pool_name=pool_name, rados_write_duration=50)

        # Removing test configurations
        for conf in test_config["configurations"]:
            for entry in conf.values():
                if entry.get("location_type") == "host":
                    entry["location_value"] = test_node.hostname
                if not mon_obj.remove_config(**entry):
                    log.error(f"Error removing config {conf}")
                    return 1
        log.info("Finished removing the test config values, passed")

    if config.get("log_to_file"):
        if not rados_obj.enable_file_logging():
            log.error("Error while setting config to enable logging into file")
            return 1
        log.info("Logging to file configured")

    if config.get("cluster_configuration_checks"):
        cls_config = config.get("cluster_configuration_checks")
        if not rados_obj.set_cluster_configuration_checks(**cls_config):
            log.error("Error while setting Cluster config checks")
            return 1
        log.info("Set up cluster configuration checks")

    if config.get("configure_balancer"):
        balancer_config = config.get("configure_balancer")
        if not rados_obj.enable_balancer(**balancer_config):
            log.error("Error while setting up balancer on the Cluster")
            return 1
        log.info("Set up Balancer on the cluster")

    if config.get("configure_pg_autoscaler"):
        autoscaler_config = config.get("configure_pg_autoscaler")
        if not rados_obj.configure_pg_autoscaler(**autoscaler_config):
            log.error("Error while setting up pg_autoscaler on the Cluster")
            return 1
        log.info("Set up pg_autoscaler on the cluster")

    if config.get("enable_compression"):
        compression_conf = config["enable_compression"]
        pool_name = compression_conf["pool_name"]
        for conf in compression_conf["configurations"]:
            for entry in conf.values():
                if not rados_obj.pool_inline_compression(pool_name=pool_name,
                                                         **entry):
                    log.error(
                        f"Error setting compression on pool : {pool_name} for config {conf}"
                    )
                    return 1
                if not rados_obj.bench_write(**compression_conf):
                    log.error("Failed to write objects into Pool")
                    return 1
                rados_obj.bench_read(**compression_conf)
                log.info(
                    "Created the replicated Pool, Finished writing data into the pool"
                )
        log.info("Completed compression tests")

    if config.get("delete_pools"):
        for name in config["delete_pools"]:
            if not rados_obj.detete_pool(name):
                log.error(f"the pool {name} could not be deleted")
                return 1
        log.info("deleted all the given pools successfully")

    log.info("All Pre-requisites completed to run Rados suite")
    return 0
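A sketch of a config dict that would exercise a few of the branches above; only keys the code actually reads are used, and the pool names are placeholders:

# Placeholder config covering the ec_pool, replicated_pool and log_to_file branches.
sample_config = {
    "ec_pool": {"pool_name": "ecpool_demo", "delete_pool": True},
    "replicated_pool": {"pool_name": "repool_demo", "delete_pool": True},
    "log_to_file": True,
}
# rc = run(ceph_cluster, config=sample_config)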
Example #12
def run(ceph_cluster, **kw):
    """
    Automates OSD re-balance test scenarios.
    1. Create replicated and/or erasure pool/pools
    2. Identify the osd to be removed
    3. Fetch the host by daemon_type=osd and osd id
    4. Fetch container id and device path
    5. Mark osd out and wait for pgs to be active+clean
    6. Remove OSD
    7. Zap device and wait for device not present
    8. Add OSD and wait for device present and pgs to be active+clean
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    client_node = ceph_cluster.get_nodes(role="client")[0]

    log.info("Running create pool test case")
    if config.get("create_pools"):
        pools = config.get("create_pools")
        for each_pool in pools:
            cr_pool = each_pool["create_pool"]
            if cr_pool.get("pool_type", "replicated") == "erasure":
                method_should_succeed(rados_obj.create_erasure_pool,
                                      name=cr_pool["pool_name"],
                                      **cr_pool)
            else:
                method_should_succeed(rados_obj.create_pool,
                                      pool_name=cr_pool["pool_name"],
                                      **cr_pool)
            method_should_succeed(rados_obj.bench_write, **cr_pool)
        pool = random.choice(pools)["create_pool"]
    if not pool:
        log.error("Failed to retrieve pool details")
        return 1

    rados_obj.change_recover_threads(config=pool, action="set")
    acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool["pool_name"])
    log.info(f"Acting set {acting_pg_set}")
    if not acting_pg_set:
        log.error("Failed to retrieve acting pg set")
        return 1
    osd_id = acting_pg_set[0]
    host = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id)
    if not host:
        log.error("Failed to fetch host details")
        return 1
    # fetch container id
    out, _ = host.exec_command(sudo=True, cmd="podman ps --format json")
    container_id = [
        item["Names"][0] for item in json.loads(out.read().decode())
        if f"osd.{osd_id}" in item["Command"]
    ][0]
    if not container_id:
        log.error("Failed to retrieve container id")
        return 1
    # fetch device path by osd_id
    vol_out, _ = host.exec_command(
        sudo=True,
        cmd=f"podman exec {container_id} ceph-volume lvm list --format json",
    )
    volume_out = vol_out.read().decode()
    dev_path = [
        v[0]["devices"][0] for k, v in json.loads(volume_out).items()
        if str(k) == str(osd_id)
    ][0]
    if not dev_path:
        log.error("Failed to get device path")
        return 1
    log.debug(
        f"device path  : {dev_path}, osd_id : {osd_id}, host.hostname : {host.hostname}"
    )
    utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=True)
    method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id)
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    utils.osd_remove(ceph_cluster, osd_id)
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    method_should_succeed(utils.zap_device, ceph_cluster, host.hostname,
                          dev_path)
    method_should_succeed(wait_for_device,
                          host,
                          container_id,
                          osd_id,
                          action="remove")
    utils.add_osd(ceph_cluster, host.hostname, dev_path, osd_id)
    method_should_succeed(wait_for_device,
                          host,
                          container_id,
                          osd_id,
                          action="add")
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    do_rados_put(mon=client_node, pool=pool["pool_name"], nobj=1000)
    method_should_succeed(wait_for_clean_pg_sets, rados_obj)
    utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=False)
    rados_obj.change_recover_threads(config=pool, action="rm")

    if config.get("delete_pools"):
        for name in config["delete_pools"]:
            method_should_succeed(rados_obj.detete_pool, name)
        log.info("deleted all the given pools successfully")

    return 0
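The container-id and device-path lookups above parse JSON from podman and ceph-volume; the same extraction on sample data, as a sketch. The JSON below only contains the fields the code indexes, with placeholder values.

import json

osd_id = 3

# Shaped like `podman ps --format json` (only the fields used above).
ps_json = json.dumps([
    {"Names": ["ceph-abc-osd-3"], "Command": ["/usr/bin/ceph-osd", "-n", "osd.3"]},
])
container_id = [
    item["Names"][0] for item in json.loads(ps_json)
    if f"osd.{osd_id}" in item["Command"]
][0]

# Shaped like `ceph-volume lvm list --format json`, keyed by osd id.
lvm_json = json.dumps({"3": [{"devices": ["/dev/vdb"]}]})
dev_path = [
    v[0]["devices"][0] for k, v in json.loads(lvm_json).items()
    if str(k) == str(osd_id)
][0]
print(container_id, dev_path)   # ceph-abc-osd-3 /dev/vdb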
Example #13
def run(ceph_cluster, **kw):
    """
    Prepares the cluster to run rados tests.
    Actions Performed:
    1. Create a Replicated and Erasure coded pools and write Objects into pools
    2. Setup email alerts for sending errors/warnings on the cluster.
        Verifies Bugs:
        https://bugzilla.redhat.com/show_bug.cgi?id=1849894
        https://bugzilla.redhat.com/show_bug.cgi?id=1878145
    3. Enable logging into file and check file permissions
        Verifies Bug : https://bugzilla.redhat.com/show_bug.cgi?id=1884469
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
        kw: Args that need to be passed to the test for initialization

    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    ceph_nodes = kw.get("ceph_nodes")
    out, err = ceph_nodes[0].exec_command(cmd="uuidgen")
    uuid = out.read().strip().decode()[0:5]

    if config.get("ec_pool"):
        ec_config = config.get("ec_pool")
        ec_config.setdefault("pool_name", f"ecpool_{uuid}")
        if not create_erasure_pool(node=cephadm, name=uuid, **ec_config):
            log.error("Failed to create the EC Pool")
            return 1
        if not run_rados_bench_write(node=cephadm, **ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        run_rados_bench_read(node=cephadm, **ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")
        if ec_config.get("delete_pool"):
            if not detete_pool(node=cephadm, pool=ec_config["pool_name"]):
                log.error("Failed to delete EC Pool")
                return 1

    if config.get("replicated_pool"):
        rep_config = config.get("replicated_pool")
        rep_config.setdefault("pool_name", f"repool_{uuid}")
        if not create_pool(
                node=cephadm,
                **rep_config,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
        if not run_rados_bench_write(node=cephadm, **rep_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        run_rados_bench_read(node=cephadm, **rep_config)
        log.info(
            "Created the replicated Pool, Finished writing data into the pool")
        if rep_config.get("delete_pool"):
            if not detete_pool(node=cephadm, pool=rep_config["pool_name"]):
                log.error("Failed to delete replicated Pool")
                return 1

    if config.get("email_alerts"):
        alert_config = config.get("email_alerts")
        if not enable_email_alerts(node=cephadm, **alert_config):
            log.error("Error while configuring email alerts")
            return 1
        log.info("email alerts configured")

    if config.get("log_to_file"):
        if not set_logging_to_file(node=cephadm):
            log.error("Error while setting config to enable logging into file")
            return 1
        log.info("Logging to file configured")

    if config.get("cluster_configuration_checks"):
        cls_config = config.get("cluster_configuration_checks")
        if not set_cluster_configuration_checks(node=cephadm, **cls_config):
            log.error("Error while setting Cluster config checks")
            return 1
        log.info("Set up cluster configuration checks")

    if config.get("configure_balancer"):
        balancer_config = config.get("configure_balancer")
        if not enable_balancer(node=cephadm, **balancer_config):
            log.error("Error while setting up balancer on the Cluster")
            return 1
        log.info("Set up Balancer on the cluster")

    if config.get("configure_pg_autoscaler"):
        autoscaler_config = config.get("configure_pg_autoscaler")
        if not configure_pg_autoscaler(node=cephadm, **autoscaler_config):
            log.error("Error while setting up pg_autoscaler on the Cluster")
            return 1
        log.info("Set up pg_autoscaler on the cluster")

    log.info("All Pre-requisites completed to run Rados suite")
    return 0
Example #14
def run(ceph_cluster, **kw) -> int:
    """
    Test to copy data from one pool to another
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    pool_obj = PoolFunctions(node=cephadm)
    client_node = ceph_cluster.get_nodes(role="client")[0]

    pool_configs_path = "conf/pacific/rados/test-confs/pool-configurations.yaml"

    with open(pool_configs_path, "r") as fd:
        pool_configs = yaml.safe_load(fd)

    pool_orig = pool_configs[config["pool-1-type"]][config["pool-1-conf"]]
    pool_target = pool_configs[config["pool-2-type"]][config["pool-2-conf"]]
    create_given_pool(rados_obj, pool_orig)
    create_given_pool(rados_obj, pool_target)

    # Writing objects with huge omap entries
    if not pool_obj.fill_omap_entries(pool_name=pool_orig["pool_name"],
                                      obj_end=500):
        log.error(
            f"Omap entries not generated on pool {pool_orig['pool_name']}")
        return 1

    do_rados_put(mon=client_node, pool=pool_orig["pool_name"], nobj=1000)

    snapshots = []
    for _ in range(5):
        snap = pool_obj.create_pool_snap(pool_name=pool_orig["pool_name"])
        if snap:
            snapshots.append(snap)
        else:
            log.error("Could not create snapshot on the pool")
            return 1

    # Using cppool to copy contents b/w the pools
    cmd = f"rados cppool {pool_orig['pool_name']} {pool_target['pool_name']}"
    client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

    # Sleeping for 2 seconds after copy to perform get operations
    time.sleep(2)

    do_rados_get(client_node, pool_target["pool_name"], 1)

    # Checking if the snapshots of the pool were also copied
    # Snapshots of the pool should not be copied
    for snap_name in snapshots:
        if pool_obj.check_snap_exists(snap_name=snap_name,
                                      pool_name=pool_target["pool_name"]):
            log.error("Snapshot of pool exists")
            return 1

    # deleting the Target pool created after cppool
    rados_obj.detete_pool(pool=pool_target["pool_name"])

    # Creating new target pool to test import/export
    create_given_pool(rados_obj, pool_target)

    # Creating temp file to hold pool info
    client_node.exec_command(cmd="touch /tmp/file", )

    # Creating an export of the data from the original pool
    cmd = f"rados export -p {pool_orig['pool_name']} /tmp/file"
    client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

    # Importing the file into the new pool
    cmd = f"rados import -p {pool_target['pool_name']} /tmp/file"
    client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

    # Sleeping for 2 seconds after copy to perform get operations
    time.sleep(2)

    do_rados_get(client_node, pool_target["pool_name"], 1)

    # Checking if the snapshots of the pool were also copied
    # Snapshots of the pool should not be copied
    for snap_name in snapshots:
        if pool_obj.check_snap_exists(snap_name=snap_name,
                                      pool_name=pool_target["pool_name"]):
            log.error("Snapshot of pool exists")
            return 1

    # deleting the Original & Target pool created after cppool
    rados_obj.detete_pool(pool=pool_target["pool_name"])
    rados_obj.detete_pool(pool=pool_orig["pool_name"])
    return 0
Example #15
def run(ceph_cluster, **kw):
    """
    Test to create a large number of omap entries on the single PG pool and test osd resiliency
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    pool_obj = PoolFunctions(node=cephadm)

    pool_target_configs = config["verify_osd_omap_entries"]["configurations"]
    omap_target_configs = config["verify_osd_omap_entries"]["omap_config"]

    # Creating pools and starting the test
    for entry in pool_target_configs.values():
        log.debug(
            f"Creating {entry['pool_type']} pool on the cluster with name {entry['pool_name']}"
        )
        if entry.get("pool_type", "replicated") == "erasure":
            method_should_succeed(rados_obj.create_erasure_pool,
                                  name=entry["pool_name"],
                                  **entry)
        else:
            method_should_succeed(
                rados_obj.create_pool,
                **entry,
            )

        log.debug(
            "Created the pool. beginning to create large number of omap entries on the pool"
        )
        if not pool_obj.fill_omap_entries(pool_name=entry["pool_name"],
                                          **omap_target_configs):
            log.error(
                f"Omap entries not generated on pool {entry['pool_name']}")
            return 1

        # Fetching the current acting set for the pool
        acting_set = rados_obj.get_pg_acting_set(pool_name=entry["pool_name"])
        rados_obj.change_recover_threads(config={}, action="set")
        log.debug(
            f"Proceeding to restart OSD's from the acting set {acting_set}")
        for osd_id in acting_set:
            rados_obj.change_osd_state(action="stop", target=osd_id)
            # sleeping for 5 seconds for re-balancing to begin
            time.sleep(5)

            # Waiting for cluster to get clean state after OSD stopped
            if not wait_for_clean_pg_sets(rados_obj):
                log.error("PG's in cluster are not active + Clean state.. ")
                return 1
            rados_obj.change_osd_state(action="restart", target=osd_id)
            log.debug(
                f"Cluster reached clean state after osd {osd_id} stop and restart"
            )

        rados_obj.change_recover_threads(config={}, action="rm")
        # deleting the pool created after the test
        rados_obj.detete_pool(pool=entry["pool_name"])

        log.info(
            f"All the OSD's from the acting set {acting_set} were restarted "
            f"and object movement completed for pool {entry['pool_name']}")

    log.info(
        "Completed testing effects of large number of omap entries on pools ")
    return 0
Example #16
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)
    pool_obj = PoolFunctions(node=cephadm)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        log.info(
            "Stopped 'm' number of OSD's from the acting set, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfilling_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info(
                    "The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active alerts: {status_report['health']['checks'].keys()}, "
                f"PG States : {status_report['num_pg_by_state']}, "
                f"checking status again in 1 minute")
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error(
                "The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(
                f"the pool {ec_config['pool_name']} could not be deleted")
            return 1

        log.info("Successfully tested EC pool recovery with K osd's surviving")
        return 0

    if config.get("Compression_tests"):
        """
        Create 2 replicated pools:
        1. Pool_1 : enable any compression algorithm(def snappy) and compression mode(aggressive/force).
        2. Pool_2 : set compression mode to none
        Writing the same amount of data to both pools, the pool with compression enabled should consume less space
        """
        pool_config = config["Compression_tests"]["pool_config"]
        compression_config = config["Compression_tests"]["compression_config"]
        pool_1 = pool_config["pool-1"]
        pool_2 = pool_config["pool-2"]

        if config["Compression_tests"]["pool_type"] == "replicated":
            if not rados_obj.create_pool(pool_name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            if not rados_obj.create_pool(pool_name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
        elif config["Compression_tests"]["pool_type"] == "erasure":
            pool_config["pool_name"] = pool_1
            if not rados_obj.create_erasure_pool(name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            pool_config["pool_name"] = pool_2
            if not rados_obj.create_erasure_pool(name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
            del pool_config["pool_name"]

        log.debug("Created two pools to test compression")

        # Enabling compression on pool-1
        if not rados_obj.pool_inline_compression(pool_name=pool_1,
                                                 **compression_config):
            log.error(
                f"Error setting compression on pool : {pool_1} for config {compression_config}"
            )
            return 1

        # Writing the same amount of data into two pools
        if not rados_obj.bench_write(pool_name=pool_1, **pool_config):
            log.error(
                "Failed to write objects into Pool-1, with compression enabled"
            )
            return 1

        if not rados_obj.bench_write(pool_name=pool_2, **pool_config):
            log.error(
                "Failed to write objects into Pool-2, without compression enabled"
            )
            return 1
        # Sleeping for 5 seconds for status to be updated.
        time.sleep(5)

        log.debug(
            "Finished writing data into the two pools. Checking pool stats")
        try:
            pool_stats = rados_obj.run_ceph_command(
                cmd="ceph df detail")["pools"]
            pool_1_stats = [
                detail for detail in pool_stats if detail["name"] == pool_1
            ][0]["stats"]
            pool_2_stats = [
                detail for detail in pool_stats if detail["name"] == pool_2
            ][0]["stats"]
        except KeyError:
            log.error(
                "No stats about the pools requested found on the cluster")
            return 1

        log.debug(f"Pool-1 stats: {pool_1_stats}")
        log.debug(f"Pool-2 stats: {pool_2_stats}")
        if pool_1_stats["compress_bytes_used"] < 0:
            log.error("No data stored under pool-1 is compressed")
            return 1

        if pool_1_stats["kb_used"] >= pool_2_stats["kb_used"]:
            log.error("Compression has no effect on the pool size...")
            return 1

        if config["Compression_tests"].get("verify_compression_ratio_set"):
            # added verification for test: CEPH-83571672
            if not rados_obj.check_compression_size(pool_name=pool_1,
                                                    **compression_config):
                log.error("data not compressed in accordance to ratio set")
                return 1

        log.info("Pool size is less when compression is enabled")
        return 0
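
        # For reference, the inline compression settings exercised above map to plain Ceph CLI
        # calls; a minimal sketch follows (module-level, outside this test). The pool name
        # "demo_pool" and the chosen algorithm/mode are illustrative values, not part of the
        # test config, and a reachable cluster with the ceph binary is assumed.

import json
import subprocess


def enable_pool_compression(pool: str, algorithm: str = "snappy", mode: str = "aggressive") -> None:
    """Enable inline compression on an existing pool via the Ceph CLI."""
    subprocess.run(["ceph", "osd", "pool", "set", pool, "compression_algorithm", algorithm], check=True)
    subprocess.run(["ceph", "osd", "pool", "set", pool, "compression_mode", mode], check=True)


def compressed_bytes(pool: str) -> int:
    """Return compress_bytes_used for the pool, read from 'ceph df detail'."""
    out = subprocess.run(["ceph", "df", "detail", "-f", "json"],
                         check=True, capture_output=True, text=True).stdout
    stats = next(p["stats"] for p in json.loads(out)["pools"] if p["name"] == pool)
    return stats.get("compress_bytes_used", 0)


# enable_pool_compression("demo_pool")
# print(compressed_bytes("demo_pool"))  # should be > 0 once compressible data is written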

    if config.get("test_autoscaler_bulk_feature"):
        """
        Tests to verify the autoscaler bulk flag, which allows pools to make use of the
        scale-down profile, making those pools start with a full complement of PGs.
        Tests include
        1. creating new pools with bulk,
        2. enabling/disabling bulk flag on existing pools
        3. Verify the PG changes when the flag is set/unset
        Verifies bugs : https://bugzilla.redhat.com/show_bug.cgi?id=2049851
        """
        regex = r"\s*(\d.\d)-rhel-\d"
        build = (re.search(regex,
                           config.get("build",
                                      config.get("rhbuild")))).groups()[0]
        if not float(build) > 5.0:
            log.info(
                "Test running on version less than 5.1, skipping verifying bulk flags"
            )
            return 0

        # Creating a pool with bulk feature
        pool_name = config.get("pool_name")
        if not pool_obj.set_bulk_flag(pool_name=pool_name):
            log.error("Failed to create a pool with bulk features")
            return 1

        # Checking the autoscaler status, final PG counts, bulk flags
        pg_target_init = pool_obj.get_target_pg_num_bulk_flag(
            pool_name=pool_name)

        # Unsetting the bulk flag and checking the change in the PG counts
        if not pool_obj.rm_bulk_flag(pool_name=pool_name):
            log.error("Failed to create a pool with bulk features")
            return 1

        # Sleeping for 5 seconds for the new PG num to be set
        time.sleep(5)
        pg_target_interim = pool_obj.get_target_pg_num_bulk_flag(
            pool_name=pool_name)

        # The target PG count once the flag is disabled must be lower than when it is enabled
        if pg_target_interim >= pg_target_init:
            log.error("PG's not reduced after bulk flag disabled")
            return 1

        # Setting the bulk flag on pool again and checking the change in the PG counts
        if not pool_obj.set_bulk_flag(pool_name=pool_name):
            log.error("Failed to disable/remove bulk features on pool")
            return 1

        # Sleeping for 5 seconds for the new PG num to be set
        time.sleep(5)

        pg_target_final = pool_obj.get_target_pg_num_bulk_flag(
            pool_name=pool_name)

        # The target PG count once the flag is re-enabled must be higher than the interim count
        if pg_target_interim >= pg_target_final:
            log.error("PG count not increased after the bulk flag was re-enabled")
            return 1

        if config.get("delete_pool"):
            rados_obj.detete_pool(pool=pool_name)
        log.info("Verified the workings of bulk flag")
        return 0
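
        # For reference, the bulk-flag operations above reduce to one pool property plus a read
        # of the autoscaler status; a minimal sketch follows. "demo_pool" is an illustrative
        # name, and the JSON field names follow the ones used elsewhere in these tests.

import json
import subprocess


def set_bulk(pool: str, enabled: bool) -> None:
    """Set or unset the bulk flag on a pool."""
    subprocess.run(["ceph", "osd", "pool", "set", pool, "bulk", "true" if enabled else "false"], check=True)


def target_pg_num(pool: str) -> int:
    """Read the autoscaler's target PG count for the pool."""
    out = subprocess.run(["ceph", "osd", "pool", "autoscale-status", "-f", "json"],
                         check=True, capture_output=True, text=True).stdout
    entry = next(e for e in json.loads(out) if e["pool_name"] == pool)
    return int(entry["pg_num_target"])


# set_bulk("demo_pool", True)   # pg_num_target should rise once the autoscaler reacts
# set_bulk("demo_pool", False)  # and drop back down when the flag is removed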

    if config.get("verify_pool_target_ratio"):
        log.debug("Verifying target size ratio on pools")
        target_configs = config["verify_pool_target_ratio"]["configurations"]
        # Creating pools and starting the test
        for entry in target_configs.values():
            log.debug(f"Creating {entry['pool_type']} pool on the cluster")
            if entry.get("pool_type", "replicated") == "erasure":
                method_should_succeed(rados_obj.create_erasure_pool,
                                      name=entry["pool_name"],
                                      **entry)
            else:
                method_should_succeed(
                    rados_obj.create_pool,
                    **entry,
                )
            rados_obj.bench_write(**entry)
            if not pool_obj.verify_target_ratio_set(
                    pool_name=entry["pool_name"],
                    ratio=entry["target_size_ratio"]):
                log.error(
                    f"Could not change the target ratio on the pool: {entry['pool_name']}"
                )
                return 1
            log.debug("Set the ratio. getting the projected pg's")

            rados_obj.change_recover_threads(config=config, action="set")
            log.debug(
                "Waiting for the rebalancing to complete on the cluster after the change"
            )
            # Sleeping for 2 minutes for rebalancing to start & for new PG count to be updated.
            time.sleep(120)

            new_pg_count = int(
                pool_obj.get_pg_autoscaler_value(pool_name=entry["pool_name"],
                                                 item="pg_num_target"))
            if new_pg_count <= entry["pg_num"]:
                log.error(
                    f"Count of PG's not increased on the pool: {entry['pool_name']}"
                    f"Initial creation count : {entry['pg_num']}"
                    f"New count after setting num target : {new_pg_count}")
                return 1

            res = wait_for_clean_pg_sets(rados_obj)
            if not res:
                log.error(
                    "PG's in cluster are not active + Clean after the ratio change"
                )
                return 1
            if not pool_obj.verify_target_ratio_set(
                    pool_name=entry["pool_name"], ratio=0.0):
                log.error(
                    f"Could not remove the target ratio on the pool: {entry['pool_name']}"
                )
                return 1

            # Sleeping for 2 minutes for rebalancing to start & for new PG count to be updated.
            time.sleep(120)
            # Checking if after the removal of ratio, the PG count has reduced
            end_pg_count = int(
                pool_obj.get_pg_autoscaler_value(pool_name=entry["pool_name"],
                                                 item="pg_num_target"))
            if end_pg_count >= new_pg_count:
                log.error(
                    f"Count of PG's not changed/ reverted on the pool: {entry['pool_name']}"
                    f" after removing the target ratios")
                return 1
            rados_obj.change_recover_threads(config=config, action="rm")
            if entry.get("delete_pool", False):
                rados_obj.detete_pool(pool=entry["pool_name"])
            log.info(
                f"Completed the test of target ratio on pool: {entry['pool_name']} "
            )
        log.info("Target ratio tests completed")
        return 0
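
        # Underneath the helper calls, setting and clearing a target ratio is a single pool
        # property; a minimal sketch follows, assuming the ceph CLI and an illustrative pool name.

import subprocess


def set_target_size_ratio(pool: str, ratio: float) -> None:
    """Set the autoscaler's target_size_ratio on a pool (0.0 clears it)."""
    subprocess.run(["ceph", "osd", "pool", "set", pool, "target_size_ratio", str(ratio)], check=True)


# set_target_size_ratio("demo_pool", 0.5)  # autoscaler should raise pg_num_target
# set_target_size_ratio("demo_pool", 0.0)  # removing the ratio lets it shrink again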

    if config.get("verify_mon_target_pg_per_osd"):
        pg_conf = config.get("verify_mon_target_pg_per_osd")
        if not mon_obj.set_config(**pg_conf):
            log.error("Could not set the value for mon_target_pg_per_osd ")
            return 1
        mon_obj.remove_config(**pg_conf)
        log.info("Set and verified the value for mon_target_pg_per_osd ")
        return 0

    if config.get("verify_pg_num_min"):
        log.debug("Verifying pg_num_min on pools")
        target_configs = config["verify_pg_num_min"]["configurations"]
        # Creating pools and starting the test
        for entry in target_configs.values():
            log.debug(f"Creating {entry['pool_type']} pool on the cluster")
            if entry.get("pool_type", "replicated") == "erasure":
                method_should_succeed(rados_obj.create_erasure_pool,
                                      name=entry["pool_name"],
                                      **entry)
            else:
                method_should_succeed(
                    rados_obj.create_pool,
                    **entry,
                )
            rados_obj.bench_write(**entry)

            if not rados_obj.set_pool_property(pool=entry["pool_name"],
                                               props="pg_num_min",
                                               value=entry["pg_num_min"]):
                log.error("Could not set the pg_min_size on the pool")
                return 1

            if entry.get("delete_pool", False):
                rados_obj.detete_pool(pool=entry["pool_name"])
            log.info(
                f"Completed the test of pg_num_min on pool: {entry['pool_name']}"
            )
        log.info("pg_num_min tests completed")
        return 0
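
# The pg_num_min property verified above pins a floor under the autoscaler; a minimal sketch
# of the equivalent CLI calls, with an illustrative pool name (standard pool get/set commands
# are assumed).

import subprocess


def set_pg_num_min(pool: str, minimum: int) -> None:
    """Set a lower bound on the pool's PG count for the autoscaler."""
    subprocess.run(["ceph", "osd", "pool", "set", pool, "pg_num_min", str(minimum)], check=True)


def get_pg_num_min(pool: str) -> str:
    """Read the current pg_num_min value back from the pool."""
    return subprocess.run(["ceph", "osd", "pool", "get", pool, "pg_num_min"],
                          check=True, capture_output=True, text=True).stdout.strip()


# set_pg_num_min("demo_pool", 16)
# print(get_pg_num_min("demo_pool"))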
Beispiel #17
0
def run(ceph_cluster, **kw):
    """
    Changes b/w various election strategies and observes mon quorum behaviour
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    cephadm_node_mon = ceph_cluster.get_nodes(role="installer")[0]

    # Collecting the number of mons in the quorum before the test
    mon_init_count = len(mon_obj.get_mon_quorum().keys())

    # By default, the election strategy is classic. Verifying that
    strategy = mon_obj.get_election_strategy()
    if strategy != 1:
        log.error(
            f"cluster created election strategy other than classic, i.e {strategy}"
        )
        return 1

    # Changing strategy to 2. i.e disallowed mode.
    if not mon_obj.set_election_strategy(mode="disallow"):
        log.error("could not set election strategy to disallow mode")
        return 1

    # sleeping for 2 seconds for new elections to be triggered and new leader to be elected
    time.sleep(2)

    log.info("Set election strategy to disallow mode. adding disallowed mons")
    # Checking if new leader will be chosen if leader is added to disallowed list
    old_leader = mon_obj.get_mon_quorum_leader()
    if not mon_obj.set_disallow_mon(mon=old_leader):
        log.error(f"could not add mon: {old_leader} to the disallowed list")
        return 1

    # sleeping for 2 seconds for new elections to be triggered and new leader to be elected
    time.sleep(2)

    current_leader = mon_obj.get_mon_quorum_leader()
    if re.search(current_leader, old_leader):
        log.error(
            f"The mon: {old_leader} added to disallow list is still leader")
        return 1

    # removing the mon from the disallowed list
    if not mon_obj.remove_disallow_mon(mon=old_leader):
        log.error(f"could not remove mon: {old_leader} from disallowed list")
        return 1

    # sleeping for 2 seconds for new elections to be triggered and new leader to be elected
    time.sleep(2)

    # Changing strategy to 3. i.e Connectivity mode.
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Checking connectivity scores of all the mons
    cmd = f"ceph daemon mon.{cephadm_node_mon.hostname} connection scores dump"
    rados_obj.run_ceph_command(cmd=cmd)

    # Changing strategy to default
    if not mon_obj.set_election_strategy(mode="classic"):
        log.error("could not set election strategy to classic mode")
        return 1

    # sleeping for 5 seconds for new elections to be triggered and new leader to be elected
    time.sleep(5)

    # Collecting the number of mons in the quorum after the test
    # todo: add other tests to ascertain the health of mon daemons in quorum
    mon_final_count = len(mon_obj.get_mon_quorum().keys())
    if mon_init_count > mon_final_count:
        log.error(
            "There are fewer mons in the quorum at the end than at the start")
        return 1

    log.info("Completed all mon election test cases")
    return 0
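
# The election-strategy flow above maps to a handful of mon commands; a minimal sketch,
# assuming the ceph CLI, with an illustrative mon name ("mon-a").

import json
import subprocess


def ceph(*args: str) -> str:
    """Run a ceph CLI command and return its stdout."""
    return subprocess.run(["ceph", *args], check=True, capture_output=True, text=True).stdout


def quorum_leader() -> str:
    """Return the name of the current quorum leader."""
    return json.loads(ceph("quorum_status", "-f", "json"))["quorum_leader_name"]


# Strategies: 1 = classic, 2 = disallow, 3 = connectivity
# ceph("mon", "set", "election_strategy", "disallow")
# ceph("mon", "add", "disallowed_leader", quorum_leader())  # forces a new leader to be elected
# ceph("mon", "rm", "disallowed_leader", "mon-a")           # "mon-a" is illustrative
# ceph("mon", "set", "election_strategy", "connectivity")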
Beispiel #18
0
def run(ceph_cluster, **kw):
    """
    Test to create pool, then add , get , delete objects & Snapshots.
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    pool_obj = PoolFunctions(node=cephadm)
    client_node = rados_obj.ceph_cluster.get_nodes(role="client")[0]
    pool_target_configs = config["verify_client_pg_access"]["configurations"]
    num_snaps = config["verify_client_pg_access"]["num_snapshots"]
    log.debug(
        "Verifying the effects of rados put, get, snap & delete on pool with single PG"
    )

    # Creating pools and starting the test
    for entry in pool_target_configs.values():
        pool_name = entry["pool_name"]
        log.debug(
            f"Creating {entry['pool_type']} pool on the cluster with name {pool_name}"
        )
        if entry.get("pool_type", "replicated") == "erasure":
            method_should_succeed(rados_obj.create_erasure_pool,
                                  name=pool_name,
                                  **entry)
        else:
            method_should_succeed(
                rados_obj.create_pool,
                **entry,
            )

        # Creating and reading objects
        with parallel() as p:
            p.spawn(do_rados_put, client_node, pool_name, 500)
            p.spawn(do_rados_get, client_node, pool_name, 1)

        # Creating and deleting snapshots on the pool
        snapshots = []
        for _ in range(num_snaps):
            snap = pool_obj.create_pool_snap(pool_name=pool_name)
            if snap:
                snapshots.append(snap)
            else:
                log.error("Could not create snapshot on the pool")
                return 1

        if not pool_obj.delete_pool_snap(pool_name=pool_name):
            log.error("Could not delete the snapshots created")
            return 1

        # Deleting the objects created on the pool
        if not pool_obj.do_rados_delete(pool_name=pool_name):
            log.error("Could not delete the objects present on pool")
            return 1

        rados_obj.detete_pool(pool=pool_name)
        log.info(f"Completed all operations on pool {pool_name}")

    log.info(
        "Completed testing effects of rados put, get, snap & delete on pool with single PG"
    )
    return 0
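
# The object and snapshot operations exercised here can also be driven directly with the
# rados CLI; a minimal sketch with illustrative pool, object, snapshot, and file names.

import subprocess


def rados(*args: str) -> None:
    """Run a rados CLI command against the cluster."""
    subprocess.run(["rados", *args], check=True)


# rados("-p", "demo_pool", "put", "obj-1", "payload.bin")   # write an object
# rados("-p", "demo_pool", "get", "obj-1", "/tmp/obj-1")    # read it back
# rados("-p", "demo_pool", "mksnap", "snap-1")              # create a pool snapshot
# rados("-p", "demo_pool", "rmsnap", "snap-1")              # remove the snapshot
# rados("-p", "demo_pool", "rm", "obj-1")                   # delete the object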
def run(ceph_cluster, **kw):

    osd_scrub_min_interval = 1800
    osd_scrub_max_interval = 3600
    osd_deep_scrub_interval = 3600

    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosScrubber(node=cephadm)
    # Storing the pg dump log before setting the scrub parameters
    before_scrub_log = rados_obj.get_pg_dump("pgid", "last_scrub_stamp")

    # Preparation of configuration parameter values based on the current
    # cluster time
    try:
        (
            scrub_begin_hour,
            scrub_begin_weekday,
            scrub_end_hour,
            scrub_end_weekday,
        ) = rados_obj.add_begin_end_hours(0, 1)

        # Scenario to verify that scrub start and end hours are same
        # CEPH-9362
        if config.get("scenario") == "begin_end_time_equal":
            log.info(f'{"Setting scrub start and end hour same"}')
            scrub_end_hour = scrub_begin_hour

        # Begin hour is greater than end hour and current time is less than end hour
        # CEPH-9363&CEPH-9364
        if config.get("scenario") == "beginTime gt endTime":
            log.info(
                "Setting scrub begin hour greater than end hour, with current time less than end hour"
            )
            scrub_end_hour, scrub_begin_hour = scrub_begin_hour, scrub_end_hour

        # set begin_hour > end_hour and current_time > end_hour
        # CEPH-9365&CEPH-9366
        if config.get("scenario") == "beginTime gt endTime lt currentTime":
            (
                scrub_begin_hour,
                scrub_begin_weekday,
                scrub_end_hour,
                scrub_end_weekday,
            ) = rados_obj.add_begin_end_hours(-1, -1)
            log.info(
                "Setting scrub begin hour greater than end hour, with current time greater than end hour"
            )
        # set begin hour and end hour greater than current time
        # CEPH-9367 & CEPH 9368
        if (config.get("scenario") == "beginTime and endTime gt currentTime"
                or config.get("scenario") == "decreaseTime"):
            (
                scrub_begin_hour,
                scrub_begin_weekday,
                scrub_end_hour,
                scrub_end_weekday,
            ) = rados_obj.add_begin_end_hours(2, 3)
            log.info(
                "Setting scrub begin and end hours greater than the current time"
            )
        # unset the scrub.CEPH-9374
        if config.get("scenario") == "unsetScrub":
            rados_obj.set_osd_flags("set", "noscrub")

        # Setting the scrub parameters
        rados_obj.set_osd_configuration("osd_scrub_min_interval",
                                        osd_scrub_min_interval)
        rados_obj.set_osd_configuration("osd_scrub_max_interval",
                                        osd_scrub_max_interval)
        rados_obj.set_osd_configuration("osd_deep_scrub_interval",
                                        osd_deep_scrub_interval)
        rados_obj.set_osd_configuration("osd_scrub_begin_week_day",
                                        scrub_begin_weekday)
        rados_obj.set_osd_configuration("osd_scrub_end_week_day",
                                        scrub_end_weekday)
        rados_obj.set_osd_configuration("osd_scrub_begin_hour",
                                        scrub_begin_hour)
        rados_obj.set_osd_configuration("osd_scrub_end_hour", scrub_end_hour)

        if config.get("scenario") == "decreaseTime":
            (
                scrub_begin_hour,
                scrub_begin_weekday,
                scrub_end_hour,
                scrub_end_weekday,
            ) = rados_obj.add_begin_end_hours(0, 1)

            rados_obj.set_osd_configuration("osd_scrub_begin_week_day",
                                            scrub_begin_weekday)
            rados_obj.set_osd_configuration("osd_scrub_end_week_day",
                                            scrub_end_weekday)
            rados_obj.set_osd_configuration("osd_scrub_begin_hour",
                                            scrub_begin_hour)
            rados_obj.set_osd_configuration("osd_scrub_end_hour",
                                            scrub_end_hour)

        # Scheduled scrub verification
        endTime = datetime.datetime.now() + datetime.timedelta(minutes=90)
        while datetime.datetime.now() <= endTime:
            after_scrub_log = rados_obj.get_pg_dump("pgid", "last_scrub_stamp")
            scrub_status = rados_obj.verify_scrub(before_scrub_log,
                                                  after_scrub_log)
            if scrub_status == 0 and (
                    config.get("scenario") == "default"
                    or config.get("scenario") == "begin_end_time_equal"
                    or config.get("scenario") == "beginTime gt endTime"
                    or config.get("scenario") == "decreaseTime"):
                log.info(f'{"Scrubbing validation is success"}')
                return 0
            log.info(f'{"Scrubbing validation is in progress..."}')
            time.sleep(240)

        if (config.get("scenario") == "beginTime gt endTime lt currentTime"
                or config.get("scenario")
                == "beginTime and endTime gt currentTime"
                or config.get("scenario") == "unsetScrub"):
            log.info(f'{"Scrubbing validation is success"}')
            return 0
        log.info(f'{"Scrubbing failed"}')
        return 1
    except Exception as err:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        sys.stderr.write("ERRORED DESC\t::%s:\n" % str(err))
        sys.stderr.write("ERRORED MODULE\t::%s:\n" % str(exc_type))
        sys.stderr.write("ERRORED FILE\t::%s:\n" % str(fname))
        sys.stderr.write("ERRORED LINE\t::%s:\n" % str(exc_tb.tb_lineno))
        return 1
    finally:
        set_default_params(rados_obj)
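
# For reference, the scrub window this test manipulates is driven by a few OSD options;
# a minimal sketch of applying them cluster-wide with the ceph CLI (the hour/weekday values
# below are illustrative, not the ones computed by the test).

import subprocess

SCRUB_WINDOW = {
    "osd_scrub_min_interval": "1800",
    "osd_scrub_max_interval": "3600",
    "osd_deep_scrub_interval": "3600",
    "osd_scrub_begin_hour": "22",     # scrub between 22:00
    "osd_scrub_end_hour": "6",        # and 06:00
    "osd_scrub_begin_week_day": "0",  # Sunday
    "osd_scrub_end_week_day": "6",    # Saturday
}


def apply_scrub_window() -> None:
    """Push the scrub window options to all OSDs via the central config store."""
    for option, value in SCRUB_WINDOW.items():
        subprocess.run(["ceph", "config", "set", "osd", option, value], check=True)


# To pause scrubbing entirely (the "unsetScrub" scenario above):
# subprocess.run(["ceph", "osd", "set", "noscrub"], check=True)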
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """

    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    client_node = ceph_cluster.get_nodes(role="client")[0]

    site1_name = config["site1"]["name"]
    site2_name = config["site2"]["name"]

    # disabling automatic crush update
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    # Sleeping for 2 seconds after map update.
    time.sleep(2)

    # Setting the election strategy to connectivity mode
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Sleeping for 2 seconds after strategy update.
    time.sleep(2)

    # Checking updated election strategy in mon map
    strategy = mon_obj.get_election_strategy()
    if strategy != 3:
        log.error(
            f"cluster created election strategy other than connectivity, i.e {strategy}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    # Creating new datacenter crush objects and moving under root/default
    for name in [site1_name, site2_name]:
        cmd = f"ceph osd crush add-bucket {name} datacenter"
        rados_obj.run_ceph_command(cmd)
        time.sleep(2)
        move_crush_item(cephadm, crush_obj=name, name="root", value="default")
        time.sleep(2)

    # Moving all the OSD and Mon daemons into respective sites
    sites = ["site1", "site2", "site3"]
    for site in sites:
        mon_hosts = [
            host_obj.hostname
            for host_obj in ceph_cluster.get_nodes(role="mon")
        ]
        log.info(f"Mon hosts defined: {mon_hosts}")
        osd_hosts = [
            host_obj.hostname
            for host_obj in ceph_cluster.get_nodes(role="osd")
        ]
        log.info(f"OSD hosts defined: {osd_hosts}")
        # Collecting hosts from each site and setting locations accordingly
        site_details = config[site]
        crush_name = site_details["name"]
        host_nodes = cephadm.cluster.get_nodes()

        for item in site_details["hosts"]:
            host = [
                node for node in host_nodes if re.search(item, node.hostname)
            ][0]
            # Moving the mon daemons into site
            if host.hostname in mon_hosts:
                cmd = f"ceph mon set_location {host.hostname} datacenter={crush_name}"
                cephadm.shell([cmd])
                log.info(
                    f"Set location for mon {host.hostname} onto site {crush_name}\n"
                    "sleeping for 5 seconds")
                time.sleep(5)

            # Moving the osd daemons into site
            if host.hostname in osd_hosts:
                move_crush_item(
                    node=cephadm,
                    crush_obj=host.hostname,
                    name="datacenter",
                    value=crush_name,
                )
                log.info(
                    f"Set location for OSD {host.hostname} onto site {crush_name}\n"
                    "sleeping for 5 seconds")
                time.sleep(5)

    log.info("Moved all the hosts into respective sites")

    stretch_rule_name = config.get("stretch_rule_name", "stretch_rule")
    if not setup_crush_rule(
            node=client_node,
            rule_name=stretch_rule_name,
            site1=site1_name,
            site2=site2_name,
    ):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)

    # Enabling the stretch cluster mode
    tiebreaker_node = get_node_by_id(cephadm.cluster,
                                     config["site3"]["hosts"][0])
    log.info(f"tiebreaker node provided: {tiebreaker_node.hostname}")
    cmd = f"ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1
    time.sleep(2)

    # wait for PG's to settle down with new crush rules after deployment of stretch mode
    wait_for_clean_pg_sets(rados_obj)

    # Checking if the pools have been updated with the new crush rules
    acting_set = rados_obj.get_pg_acting_set()
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
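
# Stripped of the test scaffolding, the stretch-mode deployment above is a short sequence of
# CLI calls; a minimal sketch with illustrative site, host, mon, and rule names.

import subprocess


def ceph(*args: str) -> None:
    """Run a ceph CLI command against the cluster."""
    subprocess.run(["ceph", *args], check=True)


# ceph("config", "set", "osd", "osd_crush_update_on_start", "false")
# ceph("mon", "set", "election_strategy", "connectivity")
# for site in ("site1", "site2"):                                     # illustrative site names
#     ceph("osd", "crush", "add-bucket", site, "datacenter")
#     ceph("osd", "crush", "move", site, "root=default")
# ceph("osd", "crush", "move", "osd-host-1", "datacenter=site1")      # repeat per OSD host
# ceph("mon", "set_location", "mon-a", "datacenter=site1")            # repeat per mon
# ceph("mon", "enable_stretch_mode", "tiebreaker-mon", "stretch_rule", "datacenter")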
Beispiel #21
0
def run(ceph_cluster, **kw):
    """
    Cephadm bootstrap, managing hosts with options, and
    full cluster deployment in a single call are supported.

    Args:
        ceph_cluster (ceph.ceph.Ceph): Ceph cluster object
        kw: test data

    The test data should be framed as per the below support,

    - Bootstrap cluster with default or custom image and
      returns after cephadm.bootstrap. To use default image, set 'registry'.

        Example:
            config:
                command: bootstrap
                base_cmd_args:
                    verbose: true
                args:
                    custom_image: true | false
                    mon-ip: <node_name>
                    mgr-id: <mgr_id>
                    fsid: <id>

    - Manage host operations like,
        - Add hosts with/without labels and IP address
        - Add/Remove labels to/from existing node
        - Set Address to node.
        - Remove hosts

        host_ops keys are definition names defined under
        CephAdmin.HostMixin and are used to call the respective method.

        supported definition names for host_ops are host_add, attach_label,
        remove_label, set_address and host_remove.

        for example,
        - test:
            name: Add host
            desc: Add new host node with IP address
            module: test_cephadm.py
            config:
                command: host
                service: add | remove | label_add | label_remove | set_address
                base_cmd_args:
                  nodes:
                    - "node3"
                  attach_address: true
                  add_label: false

    """
    log.info("Running cephadm test")
    config = kw.get("config")

    build = config.get("build", config.get("rhbuild"))
    ceph_cluster.rhcs_version = build

    if config.get("skip_setup") is True:
        log.info("Skipping setup of ceph cluster")
        return 0

    # Manage Ceph using ceph-admin orchestration
    command = config.pop("command")
    service = config.pop("service", "")

    log.info("Executing %s %s" % (service, command))

    if command in CephAdmin.direct_calls:
        cephadm = CephAdmin(cluster=ceph_cluster, **config)
        method = fetch_method(cephadm, command)
    elif service in Host.SERVICE_NAME:
        host = Host(cluster=ceph_cluster, **config)
        method = fetch_method(host, command)
    else:
        raise NotImplementedError

    if "shell" in command:
        method(args=config["args"])
    else:
        method(config)

    return 0
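
# The host operations this module wraps correspond to "ceph orch host" commands; a minimal
# sketch with illustrative hostnames, addresses, and labels.

import subprocess


def orch_host(*args: str) -> None:
    """Run a 'ceph orch host' subcommand."""
    subprocess.run(["ceph", "orch", "host", *args], check=True)


# orch_host("add", "node3", "10.0.0.13", "osd")   # add a host with an address and a label
# orch_host("label", "add", "node3", "mon")       # attach another label
# orch_host("label", "rm", "node3", "mon")        # remove a label
# orch_host("set-addr", "node3", "10.0.0.13")     # (re)set the host address
# orch_host("rm", "node3")                        # remove the host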
Beispiel #22
0
def run(ceph_cluster, **kw):
    """
    Test to Verify the pg-autoscale flag.
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)
    pool_configs_path = "conf/pacific/rados/test-confs/pool-configurations.yaml"

    regex = r"\s*(\d.\d)-rhel-\d"
    build = (re.search(regex, config.get("build",
                                         config.get("rhbuild")))).groups()[0]
    if not float(build) > 5.0:
        log.info(
            "Test running on version less than 5.1, skipping verifying autoscaler flags"
        )
        return 0

    # Setting the no-autoscale flag
    cmd = "ceph osd pool set noautoscale"
    rados_obj.run_ceph_command(cmd=cmd)

    # sleeping for 5 seconds as the command takes some time to affect the status of pools
    time.sleep(5)

    # Getting the autoscale configurations after setting the flag
    # all the pools should have autoscale set to off
    cmd = "ceph osd pool autoscale-status"
    pool_status = rados_obj.run_ceph_command(cmd=cmd)

    for entry in pool_status:
        if entry["pg_autoscale_mode"] == "on":
            log.error(
                f"Pg autoscaler not turned off for pool : {entry['pool_name']}"
            )
            return 1

    if not mon_obj.verify_set_config(section="global",
                                     name="osd_pool_default_pg_autoscale_mode",
                                     value="off"):
        log.error(
            "Default autoscale mode not set to off upon setting the no-autoscale flag"
        )
        return 1

    if not mon_obj.verify_set_config(
            section="mgr", name="mgr/pg_autoscaler/noautoscale", value="true"):
        log.error(
            "autoscale Flag not set to true upon setting the no-autoscale flag"
        )
        return 1

    # Creating a new pool, with the flag off, new pool should be created with autoscaler profile turned off
    with open(pool_configs_path, "r") as fd:
        pool_configs = yaml.safe_load(fd)

    pool_conf = pool_configs["replicated"]["sample-pool-2"]
    create_given_pool(rados_obj, pool_conf)

    cmd = "ceph osd pool autoscale-status"
    pool_status = rados_obj.run_ceph_command(cmd=cmd)

    for entry in pool_status:
        if entry["pool_name"] == pool_conf["pool_name"]:
            if entry["pg_autoscale_mode"] == "on":
                log.error(
                    f"Pg autoscaler not turned off for the new pool : {entry['pool_name']} "
                    f"created with flag turned off")
                return 1

    # Turning the autoscale flag back on. All the setting made earlier should be reverted
    cmd = "ceph osd pool unset noautoscale"
    pool_status = rados_obj.run_ceph_command(cmd=cmd)

    # sleeping for 5 seconds as the command takes some time to affect the status of pools
    time.sleep(5)

    for entry in pool_status:
        if entry["pg_autoscale_mode"] == "off":
            log.error(
                f"Pg autoscaler not turned on for pool : {entry['pool_name']}")
            return 1

    if not mon_obj.verify_set_config(section="global",
                                     name="osd_pool_default_pg_autoscale_mode",
                                     value="on"):
        log.error(
            "Default autoscale mode not set to true upon removing the no-autoscale flag"
        )
        return 1

    if not mon_obj.verify_set_config(section="mgr",
                                     name="mgr/pg_autoscaler/noautoscale",
                                     value="false"):
        log.error(
            "autoscale Flag not set to false upon removing the no-autoscale flag"
        )
        return 1

    # Deleting the pool created earlier
    if not rados_obj.detete_pool(pool=pool_conf["pool_name"]):
        log.error(f"the pool {pool_conf['pool_name']} could not be deleted")
        return 1

    log.info("Autoscale flag is working as expected.")
    return 0
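
# The cluster-wide autoscaler flag toggled above is a single global switch; a minimal CLI
# sketch (the get variant may not be available on older releases).

import subprocess


def ceph(*args: str) -> str:
    """Run a ceph CLI command and return its stdout."""
    return subprocess.run(["ceph", *args], check=True, capture_output=True, text=True).stdout.strip()


# ceph("osd", "pool", "set", "noautoscale")      # pause the PG autoscaler on all pools
# print(ceph("osd", "pool", "get", "noautoscale"))
# ceph("osd", "pool", "unset", "noautoscale")    # resume it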
Beispiel #23
0
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Actions Performed:
    1. Disables the automatic crush map update
    2. Collects the OSD daemons in the cluster and split them into 2 sites.
    3. If add capacity is selected, only half of the OSD's will be added to various sites initially.
    4. Adds the stretch rule into crush map.
    5. Adding monitors into the 2 sites.
    6. Create a replicated pool and deploy stretch mode.
    7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites )
    8. Check for the bump in election epochs throughout.
    9. Check the acting set in PG for 4 OSD's. 2 from each site.

    Verifies bugs:
    [1]. https://bugzilla.redhat.com/show_bug.cgi?id=1937088
    [2]. https://bugzilla.redhat.com/show_bug.cgi?id=1952763
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    client_node = ceph_cluster.get_nodes(role="client")[0]
    tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0]

    if not client_node and not tiebreaker_node:
        log.error(
            "Admin client and tie breaker node not configured, Cannot modify crush rules for stretch cluster"
        )
        return 1
    mon_state = get_mon_details(node=cephadm)
    if len(list(mon_state["monitors"])) < 5:
        log.error(
            f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}"
        )
        return 1
    osd_details = get_osd_details(node=cephadm)
    if len(osd_details.keys()) < 4:
        log.error(
            f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}"
        )
        return 1

    # disabling automatic crush update
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    site1 = config.get("site1", "site1")
    site2 = config.get("site2", "site2")

    # Collecting OSD details and splitting them into Site A and Site B
    sorted_osds = sort_osd_sites(all_osd_details=osd_details)
    site_a_osds = sorted_osds[0]
    site_b_osds = sorted_osds[1]
    if config.get("perform_add_capacity"):
        site_a_osds = sorted_osds[0][: (len(sorted_osds[0]) // 2)]
        site_b_osds = sorted_osds[1][: (len(sorted_osds[1]) // 2)]

    if not set_osd_sites(
        node=cephadm,
        osds=site_a_osds,
        site=site1,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    if not set_osd_sites(
        node=cephadm,
        osds=site_b_osds,
        site=site2,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    # Collecting the mon map to be compared after stretch cluster deployment
    stretch_rule_name = "stretch_rule"
    if not setup_crush_rule(
        node=client_node, rule_name=stretch_rule_name, site1=site1, site2=site2
    ):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Setting the election strategy to connectivity mode
    cmd = "/bin/ceph mon set election_strategy connectivity"
    cephadm.shell([cmd])

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)
    init_mon_state = get_mon_details(node=cephadm)

    # Checking if mon elections happened after changing election strategy
    if mon_state["epoch"] > init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after setting the connectivity mode.")
        return 1

    # Checking updated election strategy in mon map
    if init_mon_state["election_strategy"] != 3:
        log.error(
            f"Election strategy is not connectivity mode.\n Currently set {mon_state['election_strategy']}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    log.info(f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3")
    if not set_mon_sites(
        node=cephadm, tiebreaker_node=tiebreaker_node, site1=site1, site2=site2
    ):
        log.error("Failed to ad monitors into respective sites")
        return 1

    # All the existing pools should be automatically changed with stretch rule. Creating a test pool
    pool_name = "test_pool_1"
    if not create_pool(
        node=cephadm, disable_pg_autoscale=True, pool_name=pool_name, pg_num=16
    ):
        log.error("Failed to create the replicated Pool")
        return 1

    log.info("Monitors added to respective sites. enabling stretch rule")
    cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1

    if get_mon_details(node=cephadm)["epoch"] < init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after Enabling strech mode")
        return 1

    if config.get("perform_add_capacity"):
        pool_name = "test_stretch_pool"
        if not create_pool(
            node=cephadm,
            disable_pg_autoscale=True,
            pool_name=pool_name,
            crush_rule=stretch_rule_name,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
        do_rados_put(mon=client_node, pool=pool_name, nobj=1000)

        # Increasing backfill/rebalance threads so that cluster will re-balance it faster after add capacity
        change_recover_threads(node=cephadm, config=config, action="set")

        log.info("Performing add Capacity after the deployment of stretch cluster")
        site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds]
        site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds]

        if not set_osd_sites(
            node=cephadm,
            osds=site_a_osds,
            site=site1,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1
        if not set_osd_sites(
            node=cephadm,
            osds=site_b_osds,
            site=site2,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1

        # Waiting for up to 2.5 hours for the PG's to enter active + Clean state after add capacity
        # Automation for bug : [1] & [2]
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        flag = True
        while end_time > datetime.datetime.now():
            status_report = run_ceph_command(node=cephadm, cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "remapped",
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "peering",
                    "recovering",
                    "recovery_wait",
                    "peering",
                    "undersized",
                    "backfilling_wait",
                )
                flag = (
                    False
                    if any(key in rec for key in entry["state"].split("+"))
                    else True
                )

            if flag:
                log.info("The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active aletrs: {status_report['health']['checks'].keys()},"
                f"PG States : {status_report['num_pg_by_state']}"
                f" checking status again in 2 minutes"
            )
            time.sleep(120)
        change_recover_threads(node=cephadm, config=config, action="rm")
        if not flag:
            log.error(
                "The cluster did not reach active + Clean state after add capacity"
            )
            return 1

        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)

    # Checking if the pools have been updated with the new crush rules
    acting_set = get_pg_acting_set(node=cephadm, pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
def run(ceph_cluster, **kw):
    """
    Automates OSD re-balance test scenarios.
    1. Create replicated and/or erasure pool/pools
    2. Identify the first osd to be removed
    3. Fetch the host by daemon_type=osd and osd id
    4. Fetch container id and device path
    5. Mark osd out and wait for pgs to be active+clean
    6. Remove OSD
    7. Zap device and wait for device not present
    8. Identify the second osd to be removed
    9. Fetch the host by daemon_type=osd and osd id
    10. Fetch container id and device path
    11. Mark osd out
    12. Add first osd and wait for device present and pgs to be active+clean
    """
    try:
        log.info(run.__doc__)
        config = kw["config"]
        cephadm = CephAdmin(cluster=ceph_cluster, **config)
        rados_obj = RadosOrchestrator(node=cephadm)
        client_node = ceph_cluster.get_nodes(role="client")[0]

        log.info("Running osd in progress rebalance tests")
        pool = create_pools(config, rados_obj, client_node)
        should_not_be_empty(pool, "Failed to retrieve pool details")
        write_to_pools(config, rados_obj, client_node)
        rados_obj.change_recover_threads(config=pool, action="set")
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=pool["pool_name"])
        log.info(f"Acting set {acting_pg_set}")
        should_not_be_empty(acting_pg_set, "Failed to retrieve acting pg set")
        osd_id = acting_pg_set[0]
        host = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id)
        should_not_be_empty(host, "Failed to fetch host details")
        dev_path = get_device_path(host, osd_id)
        log.debug(
            f"osd1 device path  : {dev_path}, osd_id : {osd_id}, host.hostname : {host.hostname}"
        )
        utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=True)
        method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id)
        method_should_succeed(wait_for_clean_pg_sets, rados_obj)
        utils.osd_remove(ceph_cluster, osd_id)
        method_should_succeed(wait_for_clean_pg_sets, rados_obj)
        method_should_succeed(utils.zap_device, ceph_cluster, host.hostname,
                              dev_path)
        method_should_succeed(wait_for_device, host, osd_id, action="remove")
        osd_id1 = acting_pg_set[1]
        host1 = rados_obj.fetch_host_node(daemon_type="osd", daemon_id=osd_id1)
        should_not_be_empty(host1, "Failed to fetch host details")
        dev_path1 = get_device_path(host1, osd_id1)
        log.debug(
            f"osd2 device path  : {dev_path1}, osd_id : {osd_id1}, host.hostname : {host1.hostname}"
        )
        method_should_succeed(utils.set_osd_out, ceph_cluster, osd_id1)
        utils.add_osd(ceph_cluster, host.hostname, dev_path, osd_id)
        method_should_succeed(wait_for_device, host, osd_id, action="add")
        method_should_succeed(wait_for_clean_pg_sets, rados_obj)

        acting_pg_set1 = rados_obj.get_pg_acting_set(
            pool_name=pool["pool_name"])
        if len(acting_pg_set) != len(acting_pg_set1):
            log.error(
                f"Acting pg set count before {acting_pg_set} and after {acting_pg_set1} rebalance mismatched"
            )
            return 1

        if pool.get("rados_put", False):
            do_rados_get(client_node, pool["pool_name"], 1)
        utils.set_osd_devices_unamanged(ceph_cluster, unmanaged=False)
        rados_obj.change_recover_threads(config=pool, action="rm")
        if config.get("delete_pools"):
            for name in config["delete_pools"]:
                method_should_succeed(rados_obj.detete_pool, name)
            log.info("deleted all the given pools successfully")
        return 0
    except Exception as e:
        log.info(e)
        log.info(traceback.format_exc())
        return 1
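
# The OSD removal and re-add workflow in this test follows the standard cephadm device
# lifecycle; a minimal sketch with illustrative OSD IDs, hostnames, and device paths.

import subprocess


def ceph(*args: str) -> None:
    """Run a ceph CLI command against the cluster."""
    subprocess.run(["ceph", *args], check=True)


# ceph("osd", "out", "2")                                              # take OSD.2 out, let PGs drain
# ceph("orch", "osd", "rm", "2")                                       # remove the daemon
# ceph("orch", "osd", "rm", "status")                                  # watch the removal queue
# ceph("orch", "device", "zap", "osd-host-1", "/dev/sdb", "--force")   # wipe the freed device
# ceph("orch", "daemon", "add", "osd", "osd-host-1:/dev/sdb")          # redeploy an OSD on it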
def run(ceph_cluster: Ceph, **kwargs) -> int:
    """
    Return the status of the test execution run with the provided keyword arguments.

    Unlike other test suites, "steps" has been introduced to support workflow style
    execution along with customization.

    Args:
        ceph_cluster: Ceph cluster object
        kwargs:     Key/value pairs of configuration information to be used in the test.

    Returns:
        int - 0 when the execution is successful else 1 (for failure).

    Example:
        - test:
            name: cluster deployment
            desc: Deploy a minimal cluster
            config:
                steps:
                    - config:
                        command: bootstrap
                        service: cephadm
                        base_cmd_args:
                        verbose: true
                        args:
                            mon-ip: node1
                    - config:
                        command: add_hosts
                        service: host
                        args:
                            attach_ip_address: true
                            labels: apply-all-labels
                    - config:
                        command: mon
                        service: mon
                        args:
                            placement:
                                label: mon
                    - config:
                        command: mgr
                        service: mgr
                        args:
                            placement:
                                label: mgr
                    - config:
                        command: apply
                        service: osd
                        args:
                            all-available-devices: true
                    - config:
                        command: shell
                        args:
                            - ceph osd pool create <pool_name> 3 3 replicated
    """
    LOG.info("Starting Ceph cluster deployment.")

    try:
        config = kwargs["config"]
        cephadm = CephAdmin(cluster=ceph_cluster, **config)
        steps = config.get("steps")

        for step in steps:
            cfg = step["config"]

            if cfg["command"] == "shell":
                cephadm.shell(args=cfg["args"])
                continue

            obj = SERVICE_MAP[cfg["service"]](cluster=ceph_cluster, **config)
            func = fetch_method(obj, cfg["command"])
            func(cfg)

        return 0
    except BaseException as be:  # noqa
        LOG.error(be, exc_info=True)
        return 1
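
# The step-driven execution above is a table-based dispatch: the step's "service" key selects
# a wrapper object and its "command" key selects a method on it. A toy sketch of the same idea,
# with made-up stand-in classes rather than the real SERVICE_MAP.

from typing import Any, Dict, List


class Bootstrap:
    """Stand-in for the real cephadm wrapper (illustrative only)."""

    def bootstrap(self, cfg: Dict[str, Any]) -> None:
        print(f"bootstrap with {cfg.get('args')}")


class Host:
    """Stand-in for the real host-management wrapper (illustrative only)."""

    def add_hosts(self, cfg: Dict[str, Any]) -> None:
        print(f"add hosts with {cfg.get('args')}")


SERVICE_MAP = {"cephadm": Bootstrap, "host": Host}


def run_steps(steps: List[Dict[str, Any]]) -> None:
    for step in steps:
        cfg = step["config"]
        obj = SERVICE_MAP[cfg["service"]]()   # pick the wrapper class by service name
        getattr(obj, cfg["command"])(cfg)     # pick the method by command name


run_steps([
    {"config": {"service": "cephadm", "command": "bootstrap", "args": {"mon-ip": "node1"}}},
    {"config": {"service": "host", "command": "add_hosts", "args": {"labels": "apply-all-labels"}}},
])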
Beispiel #26
0
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Actions Performed:
    1. Disables the automatic crush map update
    2. Collects the OSD daemons in the cluster and split them into 2 sites.
    3. If add capacity is selected, only half of the OSD's will be added to various sites initially.
    4. Adds the stretch rule into crush map.
    5. Adding monitors into the 2 sites.
    6. Create a replicated pool and deploy stretch mode.
    7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites )
    8. Check for the bump in election epochs throughout.
    9. Check the acting set in PG for 4 OSD's. 2 from each site.
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    client_node = ceph_cluster.get_nodes(role="client")[0]
    tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0]

    if not client_node and not tiebreaker_node:
        log.error(
            "Admin client and tie breaker node not configured, Cannot modify crush rules for stretch cluster"
        )
        return 1
    mon_state = get_mon_details(node=cephadm)
    if len(list(mon_state["monitors"])) < 5:
        log.error(
            f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}"
        )
        return 1
    osd_details = get_osd_details(node=cephadm)
    if len(osd_details.keys()) < 4:
        log.error(
            f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}"
        )
        return 1

    # disabling automatic crush update
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    # Collecting OSD details and splitting them into Site A and Site B
    sorted_osds = sort_osd_sites(all_osd_details=osd_details)
    site_a_osds = sorted_osds[0]
    site_b_osds = sorted_osds[1]
    if config.get("perform_add_capacity"):
        site_a_osds = sorted_osds[0][:(len(sorted_osds[0]) // 2)]
        site_b_osds = sorted_osds[1][:(len(sorted_osds[1]) // 2)]

    if not set_osd_sites(
            node=cephadm,
            osds=site_a_osds,
            site=1,
            all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    if not set_osd_sites(
            node=cephadm,
            osds=site_b_osds,
            site=2,
            all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    # Collecting the mon map to be compared after stretch cluster deployment
    stretch_rule_name = "stretch_rule"
    if not setup_crush_rule(node=client_node, rule_name=stretch_rule_name):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Setting the election strategy to connectivity mode
    cmd = "/bin/ceph mon set election_strategy connectivity"
    cephadm.shell([cmd])

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)
    init_mon_state = get_mon_details(node=cephadm)

    # Checking if mon elections happened after changing election strategy
    if mon_state["epoch"] > init_mon_state["epoch"]:
        log.error(
            "Election epoch not bumped up after setting the connectivity mode."
        )
        return 1

    # Checking updated election strategy in mon map
    if init_mon_state["election_strategy"] != 3:
        log.error(
            f"Election strategy is not connectivity mode.\n Currently set {mon_state['election_strategy']}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    log.info(
        f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3")
    if not set_mon_sites(node=cephadm, tiebreaker_node=tiebreaker_node):
        log.error("Failed to ad monitors into respective sites")
        return 1

    # All the existing pools should be automatically changed with stretch rule. Creating a test pool
    pool_name = "test_pool_1"
    if not create_pool(node=cephadm,
                       disable_pg_autoscale=True,
                       pool_name=pool_name,
                       pg_num=16):
        log.error("Failed to create the replicated Pool")
        return 1

    log.info("Monitors added to respective sites. enabling stretch rule")
    cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1

    if get_mon_details(node=cephadm)["epoch"] < init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after Enabling strech mode")
        return 1

    if config.get("perform_add_capacity"):
        pool_name = "test_stretch_pool"
        if not create_pool(
                node=cephadm,
                disable_pg_autoscale=True,
                pool_name=pool_name,
                crush_rule=stretch_rule_name,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
        do_rados_put(mon=client_node, pool=pool_name, nobj=100)

        log.info(
            "Performing add Capacity after the deployment of stretch cluster")
        site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds]
        site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds]

        if not set_osd_sites(
                node=cephadm,
                osds=site_a_osds,
                site=1,
                all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1
        if not set_osd_sites(
                node=cephadm,
                osds=site_b_osds,
                site=2,
                all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1

        # Sleeping for 10 seconds after adding OSD's for the PG re-balancing to start and begin rados get
        time.sleep(10)
        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)

    # Checking if the pools have been updated with the new crush rules
    acting_set = get_pg_acting_set(node=cephadm, pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
Beispiel #27
0
def run(ceph_cluster: Ceph, **kwargs) -> int:
    """
    Return the status of the test execution run with the provided keyword arguments.

    Unlike other test suites, "steps" has been introduced to support workflow style
    execution along with customization.

    Args:
        ceph_cluster: Ceph cluster object
        kwargs:     Key/value pairs of configuration information to be used in the test.

    Returns:
        int - 0 when the execution is successful else 1 (for failure).

    Example:
        - test:
            name: cluster deployment
            desc: Deploy a minimal cluster
            config:
                verify_cluster_health: true | false
                steps:
                    - config:
                        command: bootstrap
                        service: cephadm
                        base_cmd_args:
                        verbose: true
                        args:
                            mon-ip: node1
                    - config:
                        command: add_hosts
                        service: host
                        args:
                            attach_ip_address: true
                            labels: apply-all-labels
                    - config:
                        command: mon
                        service: mon
                        args:
                            placement:
                                label: mon
                    - config:
                        command: mgr
                        service: mgr
                        args:
                            placement:
                                label: mgr
                    - config:
                        command: apply
                        service: osd
                        args:
                            all-available-devices: true
                    - config:
                        command: shell
                        args:
                            - ceph osd pool create <pool_name> 3 3 replicated
    """
    LOG.info("Starting Ceph cluster deployment.")
    config = kwargs["config"]
    config["overrides"] = kwargs.get("test_data", {}).get("custom-config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    try:
        steps = config.get("steps", [])
        for step in steps:
            cfg = step["config"]
            command = cfg.pop("command")
            if command == "shell":
                cephadm.shell(**cfg)
                continue

            obj = SERVICE_MAP[cfg["service"]](cluster=ceph_cluster, **config)
            func = fetch_method(obj, command)
            func(cfg)

        if config.get("verify_cluster_health"):
            cephadm.cluster.check_health(
                rhbuild=config.get("rhbuild"), client=cephadm.installer
            )
        if config.get("verify_log_files"):
            isvalid = validate_log_file_after_enable(cephadm)
            if not isvalid:
                LOG.error("Log file validation failure")
                return 1

    except BaseException as be:  # noqa
        LOG.error(be, exc_info=True)
        return 1
    finally:
        # Get cluster state
        get_cluster_state(cephadm)
    return 0
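
The step loop above is a table-driven dispatcher: each step's service name selects a handler class from SERVICE_MAP and its command selects a method on that handler (presumably via a getattr-style lookup inside fetch_method). A toy, self-contained sketch of that pattern, with made-up handler classes and step configs:

class Host:
    def add_hosts(self, cfg):
        print(f"add hosts with args {cfg.get('args')}")


class Mon:
    def apply(self, cfg):
        print(f"apply mon placement {cfg.get('args')}")


SERVICE_MAP = {"host": Host, "mon": Mon}

steps = [
    {"config": {"service": "host", "command": "add_hosts",
                "args": {"labels": "apply-all-labels"}}},
    {"config": {"service": "mon", "command": "apply",
                "args": {"placement": {"label": "mon"}}}},
]

for step in steps:
    cfg = dict(step["config"])
    command = cfg.pop("command")
    handler = SERVICE_MAP[cfg["service"]]()      # service name -> handler class
    getattr(handler, command)(cfg)               # command name -> bound method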
Beispiel #28
0
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"Acting set {acting_pg_set} has fewer than k+m OSDs"
            )
            return 1
        log.info(
            f"Acting set of the pool consists of OSDs : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        log.info(
            "Stopped 'm' OSDs from the acting set, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfilling_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info(
                    "Recovery and backfilling of the degraded PGs is complete")
                break
            log.info(
                f"Waiting for active + clean. Active alerts: {status_report['health']['checks'].keys()},"
                f" PG States : {status_report['num_pg_by_state']},"
                f" checking status again in 1 minute")
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"Acting set {acting_pg_set} has fewer than k+m OSDs after recovery"
            )
            return 1
        log.info(
            f"Acting set of the pool consists of OSDs : {acting_pg_set}")
        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error(
                "The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(
                f"the pool {ec_config['pool_name']} could not be deleted")
            return 1

        log.info("Successfully tested EC pool recovery with K osd's surviving")
        return 0

    if config.get("Compression_tests"):
        """
        Create a 2 replicated pools:
        1. Pool_1 : enable any compression algorithm(def snappy) and compression mode(aggressive/force).
        2. Pool_2 : set compression mode to none
        Writing the same amount of data on 2 pools, size of pool with compression on would consume less space
        """
        pool_config = config["Compression_tests"]["pool_config"]
        compression_config = config["Compression_tests"]["compression_config"]
        pool_1 = pool_config["pool-1"]
        pool_2 = pool_config["pool-2"]

        if config["Compression_tests"]["pool_type"] == "replicated":
            if not rados_obj.create_pool(pool_name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            if not rados_obj.create_pool(pool_name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
        elif config["Compression_tests"]["pool_type"] == "erasure":
            pool_config["pool_name"] = pool_1
            if not rados_obj.create_erasure_pool(name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            pool_config["pool_name"] = pool_2
            if not rados_obj.create_erasure_pool(name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
            del pool_config["pool_name"]

        log.debug("Created two pools to test compression")

        # Enabling compression on pool-1
        if not rados_obj.pool_inline_compression(pool_name=pool_1,
                                                 **compression_config):
            log.error(
                f"Error setting compression on pool : {pool_1} for config {compression_config}"
            )
            return 1

        # Writing the same amount of data into two pools
        if not rados_obj.bench_write(pool_name=pool_1, **pool_config):
            log.error(
                "Failed to write objects into Pool-1, with compression enabled"
            )
            return 1

        if not rados_obj.bench_write(pool_name=pool_2, **pool_config):
            log.error(
                "Failed to write objects into Pool-2, without compression enabled"
            )
            return 1
        # Sleeping for 5 seconds for status to be updated.
        time.sleep(5)

        log.debug(
            "Finished writing data into the two pools. Checking pool stats")
        try:
            pool_stats = rados_obj.run_ceph_command(
                cmd="ceph df detail")["pools"]
            pool_1_stats = [
                detail for detail in pool_stats if detail["name"] == pool_1
            ][0]["stats"]
            pool_2_stats = [
                detail for detail in pool_stats if detail["name"] == pool_2
            ][0]["stats"]
        except KeyError:
            log.error(
                "No stats about the pools requested found on the cluster")
            return 1

        log.debug(f"Pool-1 stats: {pool_1_stats}")
        log.debug(f"Pool-2 stats: {pool_2_stats}")
        # compress_bytes_used stays at 0 when no data on the pool was compressed
        if pool_1_stats["compress_bytes_used"] <= 0:
            log.error("No data stored under pool-1 is compressed")
            return 1

        if pool_1_stats["kb_used"] >= pool_2_stats["kb_used"]:
            log.error("Compression has no effect on the pool size...")
            return 1

        if config["Compression_tests"].get("verify_compression_ratio_set"):
            # added verification for test: CEPH-83571672
            if not rados_obj.check_compression_size(pool_name=pool_1,
                                                    **compression_config):
                log.error("data not compressed in accordance to ratio set")
                return 1

        log.info("Pool size is less when compression is enabled")
        return 0

    if config.get("check_autoscaler_profile"):
        """
        Verifies that the default auto-scaler profile on 5.1 builds is scale-up
        Verifies bug: https://bugzilla.redhat.com/show_bug.cgi?id=2021738
        """
        build = config.get("build", config.get("rhbuild"))
        autoscale_conf = config.get("check_autoscaler_profile")
        regex = r"5.1-rhel-\d{1}"
        if re.search(regex, build):
            log.info(
                "Test running on 5.1 builds, checking the default autoscaler profile"
            )
            if not mon_obj.verify_set_config(**autoscale_conf):
                log.error(
                    f"The default value for autoscaler profile is not scale-up in buld {build}"
                )
                return 1
            log.info(f"Autoscale profile is scale-up in release : {build}")
        else:
            log.debug(
                f"The profile is already scale-up by default in release : {build}"
            )
        return 0
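
The Compression_tests branch earlier in this example checks that the compressed pool ends up smaller, and optionally that the configured compression ratio was honoured (CEPH-83571672). A small self-contained sketch of that kind of savings computation, over pool stats shaped like the "stats" entries from `ceph df detail`; the helper name, sample numbers, and the 0.875 figure (BlueStore's documented default compression_required_ratio) are assumptions, not taken from the test library:

def compression_savings_ok(pool_stats, required_ratio=0.875):
    """Return True when compressed data shrank to at most `required_ratio` of its original size.

    compress_under_bytes : bytes that passed through the compressor
    compress_bytes_used  : space those bytes occupy after compression
    """
    under = pool_stats.get("compress_under_bytes", 0)
    used = pool_stats.get("compress_bytes_used", 0)
    if under == 0:  # nothing was eligible for, or accepted by, the compressor
        return False
    return (used / under) <= required_ratio


# Hypothetical stats, illustrative values only
sample_pool_1_stats = {"compress_under_bytes": 4 * 1024 * 1024,
                       "compress_bytes_used": 2 * 1024 * 1024}
print(compression_savings_ok(sample_pool_1_stats))  # 0.5 <= 0.875 -> True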