Example 1
def set_scylla_sysctl_value(db_cluster: ScyllaPodCluster, sysctl_name: str,
                            sysctl_value: str) -> None:
    sysctls = db_cluster.get_scylla_cluster_plain_value('/spec/sysctls')
    sysctl_to_set = f"{sysctl_name}={sysctl_value}"
    for i, _ in enumerate(sysctls):
        if sysctls[i].startswith(f"{sysctl_name}="):
            sysctls[i] = sysctl_to_set
            break
    else:
        sysctls.append(sysctl_to_set)
    db_cluster.replace_scylla_cluster_value("/spec/sysctls", sysctls)
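The helper performs an upsert on the '/spec/sysctls' list: an existing name=value entry is replaced in place, and the for/else falls through to append only when no prefix matched. A minimal sketch of that behaviour, assuming the function above is importable and using a hypothetical in-memory stub instead of a real ScyllaPodCluster:

class FakePodCluster:
    # Hypothetical stand-in mimicking the two cluster calls the helper uses
    def __init__(self, sysctls):
        self.spec = {'sysctls': sysctls}

    def get_scylla_cluster_plain_value(self, path):
        assert path == '/spec/sysctls'
        return list(self.spec['sysctls'])  # return a copy, like a fresh API read

    def replace_scylla_cluster_value(self, path, value):
        assert path == '/spec/sysctls'
        self.spec['sysctls'] = value


fake = FakePodCluster(["fs.aio-max-nr=1048576"])
set_scylla_sysctl_value(fake, "fs.aio-max-nr", "2097152")     # replaces the existing entry
set_scylla_sysctl_value(fake, "vm.max_map_count", "1048575")  # no match, so it appends
assert fake.spec['sysctls'] == ["fs.aio-max-nr=2097152", "vm.max_map_count=1048575"]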
Example 2
def _bring_cluster_back_to_original_state(db_cluster: ScyllaPodCluster,
                                          config_map: dict,
                                          original_scylla_cluster_spec: dict):
    restart = False
    try:
        # Restore the cluster spec. There is one problem though:
        #  scylla-operator does not support rack removal, so if we see an extra rack we need
        #  to remove the members from it but keep it in the cluster spec.
        # The alternative is to redeploy the cluster, which is expensive, so we do that only
        #  when it is really needed.

        original_rack_specs = original_scylla_cluster_spec.get(
            'datacenter', {}).get('racks', [])
        current_cluster_spec = db_cluster.get_scylla_cluster_plain_value(
            '/spec')
        current_rack_specs = current_cluster_spec.get('datacenter',
                                                      {}).get('racks', [])
        if len(original_rack_specs) < len(current_rack_specs):
            # Add the new racks with 0 members in them to the original cluster specification.
            # At this point original_rack_specs describes the cluster spec we want to end up with.
            new_racks = current_rack_specs[len(original_rack_specs):]
            for rack in new_racks:
                rack['members'] = 0
            original_rack_specs.extend(new_racks)

        # NOTE: ignore 'forceRedeploymentReason' field always to avoid redundant restarts
        original_scylla_cluster_spec.pop("forceRedeploymentReason", None)
        current_cluster_spec.pop("forceRedeploymentReason", None)
        if original_scylla_cluster_spec != current_cluster_spec:
            # If the cluster spec we currently have is not equal to what we want, replace it
            #  and remember to restart the cluster afterwards
            db_cluster.replace_scylla_cluster_value(
                '/spec', original_scylla_cluster_spec)
            restart = True

        # Restore config-map scylla-config
        with db_cluster.scylla_config_map as recover_config_map:
            if recover_config_map != config_map:
                # If the config map changed, Scylla will be restarted anyway, so we
                #  don't have to restart it explicitly
                restart = False
            recover_config_map.clear()
            recover_config_map.update(config_map)
        if restart:
            db_cluster.restart_scylla()
    except Exception as exc:  # pylint: disable=broad-except
        tester.healthy_flag = False
        pytest.fail(
            "Failed to bring cluster nodes back to original number due to :\n"
            + "".join(
                traceback.format_exception(type(exc), exc, exc.__traceback__)))
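The rack-reconciliation branch above can be exercised in isolation. A short sketch with plain dicts (rack names are illustrative) shows how an extra rack is kept in the target spec but scaled down to 0 members instead of being removed:

original_racks = [{'name': 'rack-a', 'members': 3}]
current_racks = [{'name': 'rack-a', 'members': 3},
                 {'name': 'rack-b', 'members': 2}]  # rack added during the test

if len(original_racks) < len(current_racks):
    new_racks = current_racks[len(original_racks):]
    for rack in new_racks:
        rack['members'] = 0  # scale the extra rack to zero; the operator cannot delete racks
    original_racks.extend(new_racks)

assert original_racks == [{'name': 'rack-a', 'members': 3},
                          {'name': 'rack-b', 'members': 0}]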
Example 3
def get_scylla_sysctl_value(db_cluster: ScyllaPodCluster,
                            sysctl_name: str) -> int:
    sysctls = db_cluster.get_scylla_cluster_plain_value('/spec/sysctls')
    for sysctl in sysctls:
        if sysctl.startswith(f"{sysctl_name}="):
            return int(sysctl.split("=")[-1])
    raise ValueError(f"Cannot find '{sysctl_name}' sysctl")
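A quick sanity check of the getter, reusing the hypothetical FakePodCluster stub sketched under Example 1:

fake = FakePodCluster(["fs.aio-max-nr=1048576", "vm.max_map_count=1048575"])
assert get_scylla_sysctl_value(fake, "vm.max_map_count") == 1048575
try:
    get_scylla_sysctl_value(fake, "kernel.pid_max")
except ValueError as exc:
    print(exc)  # Cannot find 'kernel.pid_max' sysctl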
def test_ha_update_spec_while_rollout_restart(db_cluster: ScyllaPodCluster):
    """
    Cover the issue https://github.com/scylladb/scylla-operator/issues/410
    Validate that cluster resources can be updated while the scylla-operator is rolling out.
    - update the cluster specification a few times
    - start rollout restart in parallel with the update
    - validate that the cluster specification has been updated
    """
    terminate_change_spec_thread = threading.Event()
    value = 1048576
    crd_update_errors = []

    def change_cluster_spec():
        nonlocal value
        nonlocal crd_update_errors
        while not terminate_change_spec_thread.wait(0.1):
            try:
                db_cluster.replace_scylla_cluster_value(
                    '/spec/sysctls', [f"fs.aio-max-nr={value + 1}"])
                # Increase the value only after the sysctls spec has actually been updated,
                # to avoid the counter running ahead of the spec when an update fails
                value += 1
            except Exception as error:  # pylint: disable=broad-except
                log.debug("Change /spec/sysctls value to %d failed. Error: %s",
                          value, str(error))
                crd_update_errors.append(str(error))

    change_cluster_spec_thread = threading.Thread(target=change_cluster_spec,
                                                  daemon=True)
    log.info("Start update cluster specification")
    change_cluster_spec_thread.start()

    log.info("Start rollout restart")
    scylla_operator_rollout_restart(db_cluster)
    operator_rollout_errors = wait_for_scylla_operator_rollout_complete(
        db_cluster)
    assert not operator_rollout_errors, "Rollout restart failed. Reasons: {}".format(
        '\n'.join(operator_rollout_errors))

    log.info("Stop update cluster specification")
    terminate_change_spec_thread.set()
    change_cluster_spec_thread.join()

    assert not crd_update_errors, \
        "Found the following errors during rollout restart: {}".format("\n".join(crd_update_errors))

    sysctl_value = db_cluster.get_scylla_cluster_plain_value('/spec/sysctls')
    expected_sysctl_value = [f"fs.aio-max-nr={value}"]
    assert expected_sysctl_value == sysctl_value, \
        f"Cluster specification has not been updated. Expected {expected_sysctl_value}, actual {sysctl_value}"
def _bring_cluster_back_to_original_state(
        db_cluster: ScyllaPodCluster,
        config_map: dict,
        original_scylla_cluster_spec: dict
):
    restart = False
    try:
        # Restore the cluster spec. There is one problem though:
        #  scylla-operator does not support rack removal, so if we see an extra rack we need
        #  to remove the members from it but keep it in the cluster spec.
        # The alternative is to redeploy the cluster, which is expensive, so we do that only
        #  when it is really needed.

        original_rack_specs = original_scylla_cluster_spec.get('datacenter', {}).get('racks', [])
        current_cluster_spec = db_cluster.get_scylla_cluster_plain_value('/spec')
        current_rack_specs = current_cluster_spec.get('datacenter', {}).get('racks', [])
        if len(original_rack_specs) < len(current_rack_specs):
            # Add the new racks with 0 members in them to the original cluster specification.
            # At this point original_rack_specs describes the cluster spec we want to end up with.
            new_racks = current_rack_specs[len(original_rack_specs):]
            for rack in new_racks:
                rack['members'] = 0
            original_rack_specs.extend(new_racks)

        # Restore config-map scylla-config
        with db_cluster.scylla_config_map as recover_config_map:
            if recover_config_map != config_map:
                recover_config_map.clear()
                recover_config_map.update(config_map)
                restart = True

        # NOTE: ignore 'forceRedeploymentReason' field always to avoid redundant restarts
        original_scylla_cluster_spec.pop("forceRedeploymentReason", None)
        current_cluster_spec.pop("forceRedeploymentReason", None)
        if original_scylla_cluster_spec != current_cluster_spec:
            # If the cluster spec we currently have is not equal to what we want, replace it.
            # This will cause a rollout restart of the scylla pods at the operator level.
            # WARNING: if the number of nodes differs, then we will have incorrect data
            #          in "db_cluster.nodes". For the moment all changes to the node number
            #          must go through the 'add_nodes' and 'decommission' methods only.
            db_cluster.replace_scylla_cluster_value('/spec', original_scylla_cluster_spec)
            db_cluster.wait_sts_rollout_restart(len(db_cluster.nodes))
            restart = False

        if restart:
            db_cluster.restart_scylla()
    except Exception as exc:  # pylint: disable=broad-except
        tester.healthy_flag = False
        pytest.fail("Failed to bring cluster nodes back to original number due to :\n" +
                    "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)))