Example #1
async def check_nodes_present_or_gone(
    connection_factory,
    old_replicas: int,
    new_replicas: int,
    node_prefix: str,
    logger: logging.Logger,
):
    """
    :param connection_factory: A callable that allows the operator to connect
        to the database. The operator regularly reconnects, because an
        existing connection may have been opened to a CrateDB node that has
        since been shut down.
    :param old_replicas: The number of replicas in a StatefulSet before
        scaling.
    :param new_replicas: The number of replicas in a StatefulSet after scaling.
    :param node_prefix: The prefix of the node names in CrateDB.
    :raises: A :class:`kopf.TemporaryError` when nodes are missing (scale up)
        or still available (scale down).
    """
    full_node_list = [
        f"{node_prefix}-{i}" for i in range(max(old_replicas, new_replicas))
    ]
    async with connection_factory() as conn:
        async with conn.cursor() as cursor:
            await cursor.execute(
                """
                SELECT name FROM sys.nodes WHERE name = ANY(%s)
                """,
                (full_node_list,),
            )
            rows = await cursor.fetchall()
            available_nodes = {r[0] for r in rows} if rows else set()
            candidate_node_names = {
                f"{node_prefix}-{i}"
                for i in range(
                    min(old_replicas, new_replicas), max(old_replicas, new_replicas)
                )
            }
            if old_replicas < new_replicas:
                # Scale up: wait for the missing nodes to join the cluster.
                if not candidate_node_names.issubset(available_nodes):
                    missing_nodes = ", ".join(sorted(candidate_node_names))
                    raise kopf.TemporaryError(
                        f"Waiting for nodes {missing_nodes} to be present.", delay=15
                    )
            elif old_replicas > new_replicas:
                # Scale down: wait for the excess nodes to be gone.
                if candidate_node_names.issubset(available_nodes):
                    excess_nodes = ", ".join(sorted(candidate_node_names))
                    raise kopf.TemporaryError(
                        f"Waiting for nodes {excess_nodes} to be gone.", delay=15
                    )
            else:
                logger.info(
                    "No need to wait for nodes with prefix '%s', since the "
                    "number of replicas didn't change.",
                    node_prefix,
                )
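
For illustration, a hedged sketch of how this check could be invoked after scaling a hypothetical hot data StatefulSet from 2 to 3 replicas; ``conn_factory`` and ``logger`` are assumed to exist as in the surrounding examples, and this fragment would run inside an async handler:

# Hypothetical usage only: after the StatefulSet has been patched from 2 to 3
# replicas, keep the handler retrying (TemporaryError, delay=15) until node
# "data-hot-2" shows up in sys.nodes.
await check_nodes_present_or_gone(
    conn_factory,
    old_replicas=2,
    new_replicas=3,
    node_prefix="data-hot",
    logger=logger,
)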
async def restart_cluster(namespace: str, name: str, total_nodes: int,
                          logger: logging.Logger) -> None:
    """
    Perform a rolling restart of the CrateDB cluster ``name`` in ``namespace``.

    One node at a time, this function will terminate first the master nodes and
    then the data nodes in the cluster. After triggering a pod's termination,
    the operator will wait for that pod to be terminated and gone. It will then
    wait for the cluster to have the desired number of nodes again and for the
    cluster to be in a ``GREEN`` state.

    :param namespace: The Kubernetes namespace in which to look up the CrateDB
        cluster.
    :param name: The CrateDB custom resource name defining the CrateDB cluster.
    :param total_nodes: The total number of nodes that the cluster should
        consist of, per the CrateDB cluster spec.
    """
    coapi = CustomObjectsApi()
    core = CoreV1Api()

    cluster = await coapi.get_namespaced_custom_object(
        group=API_GROUP,
        version="v1",
        plural=RESOURCE_CRATEDB,
        namespace=namespace,
        name=name,
    )
    password = await get_system_user_password(namespace, name, core)
    host = await get_host(core, namespace, name)
    conn_factory = connection_factory(host, password)

    if "master" in cluster["spec"]["nodes"]:
        await restart_statefulset(core, conn_factory, namespace, name,
                                  "master", total_nodes, logger)
    for node_spec in cluster["spec"]["nodes"]["data"]:
        await restart_statefulset(core, conn_factory, namespace, name,
                                  node_spec["name"], total_nodes, logger)
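
The ``connection_factory`` helper itself is not part of these examples. A minimal sketch of what it could look like, assuming CrateDB's PostgreSQL wire protocol and the ``aiopg`` driver; the names and defaults below are assumptions, not the project's actual implementation:

import aiopg


def connection_factory(host: str, password: str, port: int = 5432):
    # Hypothetical sketch: return a zero-argument callable so callers can do
    # ``async with conn_factory() as conn`` and get a fresh connection each
    # time -- important because the previously connected node may have been
    # shut down in the meantime.
    def factory():
        return aiopg.connect(
            host=host,
            port=port,
            user="system",
            password=password,
            dbname="doc",
        )

    return factory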
Example #3
    async def _ensure_no_snapshots_in_progress(self, namespace, name, logger):
        async with ApiClient() as api_client:
            core = CoreV1Api(api_client)

            host = await get_host(core, namespace, name)
            password = await get_system_user_password(core, namespace, name)
            conn_factory = connection_factory(host, password)

            snapshots_in_progress, statement = await are_snapshots_in_progress(
                conn_factory, logger)
            if snapshots_in_progress:
                # Raising a TemporaryError will clear any registered subhandlers, so we
                # execute this one directly instead to make sure it runs.
                # The same guarantees about it being executed only once still stand.
                await kopf.execute(
                    fns={
                        "notify_backup_running":
                        subhandler_partial(self._notify_backup_running, logger)
                    })
                raise kopf.TemporaryError(
                    "A snapshot is currently in progress, "
                    f"waiting for it to finish: {statement}",
                    delay=30,
                )
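
``are_snapshots_in_progress`` is referenced but not shown. A rough, hypothetical sketch of what such a check could do, assuming it inspects running statements in CrateDB's ``sys.jobs`` table; the real helper may use different criteria:

async def are_snapshots_in_progress(conn_factory, logger):
    # Hypothetical sketch: report whether a snapshot statement is currently
    # running, and return the statement text so it can be surfaced in the
    # TemporaryError message above.
    async with conn_factory() as conn:
        async with conn.cursor() as cursor:
            await cursor.execute(
                "SELECT stmt FROM sys.jobs WHERE upper(stmt) LIKE 'CREATE SNAPSHOT%'"
            )
            row = await cursor.fetchone()
    if row:
        return True, row[0]
    return False, None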
Example #4
async def scale_cluster(
    apps: AppsV1Api,
    core: CoreV1Api,
    namespace: str,
    name: str,
    old: kopf.Body,
    master_diff_item: Optional[kopf.DiffItem],
    data_diff_items: Optional[kopf.Diff],
    logger: logging.Logger,
):
    """
    Scale cluster ``name`` according to the given ``master_diff_item`` and
    ``data_diff_items``.

    :param apps: An instance of the Kubernetes Apps V1 API.
    :param core: An instance of the Kubernetes Core V1 API.
    :param namespace: The Kubernetes namespace for the CrateDB cluster.
    :param name: The CrateDB custom resource name defining the CrateDB cluster.
    :param old: The old resource body.
    :param master_diff_item: An optional change indicating how many master
        nodes a cluster should have.
    :param data_diff_items: An optional list of changes made to the individual
        data node specifications.
    """
    spec = old["spec"]
    total_number_of_nodes = get_total_nodes_count(spec["nodes"])

    host = await get_host(core, namespace, name)
    password = await get_system_user_password(core, namespace, name)
    conn_factory = connection_factory(host, password)

    num_master_nodes = 0
    if "master" in spec["nodes"]:
        num_master_nodes = spec["nodes"]["master"]["replicas"]

    if master_diff_item:
        _, _, old_replicas, new_replicas = master_diff_item
        total_number_of_nodes = total_number_of_nodes + new_replicas - old_replicas
        num_master_nodes = new_replicas
        sts_name = f"crate-master-{name}"
        statefulset = await apps.read_namespaced_stateful_set(
            namespace=namespace, name=sts_name
        )
        current_replicas = statefulset.spec.replicas
        if current_replicas != new_replicas:
            await update_statefulset(
                apps,
                namespace,
                sts_name,
                statefulset,
                new_replicas,
                total_number_of_nodes,
            )
            await scale_cluster_patch_total_nodes(
                apps, namespace, name, spec, total_number_of_nodes
            )

        await check_nodes_present_or_gone(
            conn_factory,
            old_replicas,
            new_replicas,
            "master",
            logger,
        )

    if data_diff_items:
        for _, field_path, old_replicas, new_replicas in data_diff_items:
            if old_replicas < new_replicas:
                # scale up
                total_number_of_nodes = (
                    total_number_of_nodes + new_replicas - old_replicas
                )
                index, *_ = field_path
                index = int(index)
                node_spec = spec["nodes"]["data"][index]
                node_name = node_spec["name"]
                sts_name = f"crate-data-{node_name}-{name}"
                statefulset = await apps.read_namespaced_stateful_set(
                    namespace=namespace, name=sts_name
                )
                current_replicas = statefulset.spec.replicas
                if current_replicas != new_replicas:
                    await update_statefulset(
                        apps,
                        namespace,
                        sts_name,
                        statefulset,
                        new_replicas,
                        total_number_of_nodes,
                    )
                    await scale_cluster_patch_total_nodes(
                        apps, namespace, name, spec, total_number_of_nodes
                    )

                await check_nodes_present_or_gone(
                    conn_factory,
                    old_replicas,
                    new_replicas,
                    f"data-{node_name}",
                    logger,
                )

        for _, field_path, old_replicas, new_replicas in data_diff_items:
            if old_replicas > new_replicas:
                # scale down
                # First check if the cluster is healthy at all,
                # and prevent scaling down if not.
                await _ensure_cluster_healthy(
                    name, namespace, apps, conn_factory, logger
                )

                total_number_of_nodes = (
                    total_number_of_nodes + new_replicas - old_replicas
                )
                index, *_ = field_path
                index = int(index)
                node_spec = spec["nodes"]["data"][index]
                node_name = node_spec["name"]
                sts_name = f"crate-data-{node_name}-{name}"
                statefulset = await apps.read_namespaced_stateful_set(
                    namespace=namespace, name=sts_name
                )
                current_replicas = statefulset.spec.replicas
                if current_replicas != new_replicas:
                    excess_nodes = [
                        f"data-{node_name}-{i}"
                        for i in range(new_replicas, old_replicas)
                    ]
                    await deallocate_nodes(
                        conn_factory,
                        total_number_of_nodes - num_master_nodes,
                        excess_nodes,
                        logger,
                    )

                    await update_statefulset(
                        apps,
                        namespace,
                        sts_name,
                        statefulset,
                        new_replicas,
                        total_number_of_nodes,
                    )
                    await scale_cluster_patch_total_nodes(
                        apps, namespace, name, spec, total_number_of_nodes
                    )

                await check_nodes_present_or_gone(
                    conn_factory,
                    old_replicas,
                    new_replicas,
                    f"data-{node_name}",
                    logger,
                )

    # Reset the deallocation
    if "master" in spec["nodes"]:
        reset_pod_name = f"crate-master-{name}-0"
    else:
        reset_pod_name = f"crate-data-{spec['nodes']['data'][0]['name']}-{name}-0"
    await reset_allocation(namespace, reset_pod_name, "ssl" in spec["cluster"])

    # Acknowledge all node checks that state that the expected number of nodes
    # doesn't line up. The StatefulSets have been adjusted, and once a pod
    # restarts, the node will know about it.
    async with conn_factory() as conn:
        async with conn.cursor() as cursor:
            await cursor.execute(
                """
                UPDATE sys.node_checks SET acknowledged = TRUE WHERE id = 1
                """
            )
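
For reference, a hedged illustration of the shape of the kopf diff items this function unpacks; the values below are made up:

# Hypothetical values only. Each kopf diff item is a 4-tuple of
# (operation, field_path, old_value, new_value); ``scale_cluster`` reads the
# old and new replica counts from the last two elements, and for data nodes it
# derives the list index from the first element of ``field_path``.
master_diff_item = ("change", ("master", "replicas"), 3, 5)
data_diff_items = [
    ("change", ("0", "replicas"), 2, 3),  # spec["nodes"]["data"][0] scaled up
]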
async def test_scale_cluster(
    repl_master_from,
    repl_master_to,
    repl_hot_from,
    repl_hot_to,
    repl_cold_from,
    repl_cold_to,
    faker,
    namespace,
    cleanup_handler,
    cratedb_crd,
    kopf_runner,
):
    coapi = CustomObjectsApi()
    core = CoreV1Api()
    name = faker.domain_word()

    # Clean up persistent volume after the test
    cleanup_handler.append(
        core.delete_persistent_volume(
            name=f"temp-pv-{namespace.metadata.name}-{name}"))
    body = {
        "apiVersion": "cloud.crate.io/v1",
        "kind": "CrateDB",
        "metadata": {
            "name": name
        },
        "spec": {
            "cluster": {
                "imageRegistry": "crate",
                "name": "my-crate-cluster",
                "version": "4.1.5",
            },
            "nodes": {
                "data": []
            },
        },
    }
    if repl_master_from:
        body["spec"]["nodes"]["master"] = {
            "replicas": repl_master_from,
            "resources": {
                "cpus": 0.5,
                "memory": "1Gi",
                "heapRatio": 0.25,
                "disk": {
                    "storageClass": "default",
                    "size": "16GiB",
                    "count": 1
                },
            },
        }
    body["spec"]["nodes"]["data"].append(
        {
            "name": "hot",
            "replicas": repl_hot_from,
            "resources": {
                "cpus": 0.5,
                "memory": "1Gi",
                "heapRatio": 0.25,
                "disk": {
                    "storageClass": "default",
                    "size": "16GiB",
                    "count": 1
                },
            },
        })
    if repl_cold_from:
        body["spec"]["nodes"]["data"].append(
            {
                "name": "cold",
                "replicas": repl_cold_from,
                "resources": {
                    "cpus": 0.5,
                    "memory": "1Gi",
                    "heapRatio": 0.25,
                    "disk": {
                        "storageClass": "default",
                        "size": "16GiB",
                        "count": 1
                    },
                },
            })
    await coapi.create_namespaced_custom_object(
        group=API_GROUP,
        version="v1",
        plural=RESOURCE_CRATEDB,
        namespace=namespace.metadata.name,
        body=body,
    )

    host = await asyncio.wait_for(
        get_public_host(core, namespace.metadata.name, name),
        # It takes a while to retrieve an external IP on AKS.
        timeout=BACKOFF_TIME * 5,
    )
    password = await get_system_user_password(namespace.metadata.name, name,
                                              core)

    await assert_wait_for(
        True,
        is_cluster_healthy,
        connection_factory(host, password),
        repl_master_from + repl_hot_from + repl_cold_from,
        err_msg="Cluster wasn't healthy after 5 minutes.",
        timeout=BACKOFF_TIME * 5,
    )

    patch_body = []
    if repl_master_from != repl_master_to:
        patch_body.append({
            "op": "replace",
            "path": "/spec/nodes/master/replicas",
            "value": repl_master_to,
        })
    if repl_hot_from != repl_hot_to:
        patch_body.append({
            "op": "replace",
            "path": "/spec/nodes/data/0/replicas",
            "value": repl_hot_to,
        })
    if repl_cold_from != repl_cold_to:
        patch_body.append({
            "op": "replace",
            "path": "/spec/nodes/data/1/replicas",
            "value": repl_cold_to,
        })
    await coapi.patch_namespaced_custom_object(
        group=API_GROUP,
        version="v1",
        plural=RESOURCE_CRATEDB,
        namespace=namespace.metadata.name,
        name=name,
        body=patch_body,
    )

    await assert_wait_for(
        True,
        is_cluster_healthy,
        connection_factory(host, password),
        repl_master_to + repl_hot_to + repl_cold_to,
        err_msg="Cluster wasn't healthy after 5 minutes.",
        timeout=BACKOFF_TIME * 5,
    )
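
``assert_wait_for`` is used throughout these tests but not shown. A minimal sketch of a compatible polling helper, assuming the call signature visible above (expected value, async predicate, its arguments, plus ``err_msg`` and ``timeout``):

import asyncio


async def assert_wait_for(expected, coro_fn, *args, err_msg="", timeout=180, delay=5):
    # Hypothetical sketch: poll the async predicate until it returns the
    # expected value, failing the test with ``err_msg`` once ``timeout``
    # seconds have passed.
    async def _poll():
        while await coro_fn(*args) != expected:
            await asyncio.sleep(delay)

    try:
        await asyncio.wait_for(_poll(), timeout=timeout)
    except asyncio.TimeoutError:
        raise AssertionError(err_msg)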
Example #6
async def test_restart_cluster(faker, namespace, cleanup_handler, cratedb_crd,
                               kopf_runner):
    coapi = CustomObjectsApi()
    core = CoreV1Api()
    name = faker.domain_word()

    # Clean up persistent volume after the test
    cleanup_handler.append(
        core.delete_persistent_volume(
            name=f"temp-pv-{namespace.metadata.name}-{name}"))
    await coapi.create_namespaced_custom_object(
        group=API_GROUP,
        version="v1",
        plural=RESOURCE_CRATEDB,
        namespace=namespace.metadata.name,
        body={
            "apiVersion": "cloud.crate.io/v1",
            "kind": "CrateDB",
            "metadata": {
                "name": name
            },
            "spec": {
                "cluster": {
                    "imageRegistry": "crate",
                    "name": "my-crate-cluster",
                    "version": "4.1.5",
                },
                "nodes": {
                    "data": [
                        {
                            "name": "hot",
                            "replicas": 1,
                            "resources": {
                                "cpus": 0.5,
                                "memory": "1Gi",
                                "heapRatio": 0.25,
                                "disk": {
                                    "storageClass": "default",
                                    "size": "16GiB",
                                    "count": 1,
                                },
                            },
                        },
                        {
                            "name": "cold",
                            "replicas": 2,
                            "resources": {
                                "cpus": 0.5,
                                "memory": "1Gi",
                                "heapRatio": 0.25,
                                "disk": {
                                    "storageClass": "default",
                                    "size": "16GiB",
                                    "count": 1,
                                },
                            },
                        },
                    ],
                },
            },
        },
    )

    host = await asyncio.wait_for(
        get_public_host(core, namespace.metadata.name, name),
        # It takes a while to retrieve an external IP on AKS.
        timeout=BACKOFF_TIME * 5,
    )

    password = await get_system_user_password(namespace.metadata.name, name,
                                              core)

    await assert_wait_for(
        True,
        do_pods_exist,
        core,
        namespace.metadata.name,
        {
            f"crate-data-hot-{name}-0",
            f"crate-data-cold-{name}-0",
            f"crate-data-cold-{name}-1",
        },
    )

    await assert_wait_for(
        True,
        is_cluster_healthy,
        connection_factory(host, password),
        err_msg="Cluster wasn't healthy after 5 minutes.",
        timeout=BACKOFF_TIME * 5,
    )

    pods = await core.list_namespaced_pod(namespace=namespace.metadata.name)
    original_pods = {p.metadata.uid for p in pods.items}

    await asyncio.wait_for(
        restart_cluster(namespace.metadata.name, name, 3,
                        logging.getLogger(__name__)),
        BACKOFF_TIME * 15,
    )

    pods = await core.list_namespaced_pod(namespace=namespace.metadata.name)
    new_pods = {p.metadata.uid for p in pods.items}

    assert original_pods.intersection(new_pods) == set()
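
``do_pods_exist`` is likewise only referenced. A hypothetical sketch consistent with how it is called above, returning True once all named pods exist in the namespace:

async def do_pods_exist(core: CoreV1Api, namespace: str, expected: set) -> bool:
    # Hypothetical sketch: True once every expected pod name is present in the
    # namespace.
    pods = await core.list_namespaced_pod(namespace=namespace)
    return expected.issubset({p.metadata.name for p in pods.items})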
Example #7
async def restart_cluster(
    core: CoreV1Api,
    namespace: str,
    name: str,
    old: kopf.Body,
    logger: logging.Logger,
    patch: kopf.Patch,
    status: kopf.Status,
) -> None:
    """
    Perform a rolling restart of the CrateDB cluster ``name`` in ``namespace``.

    One node at a time, this function will terminate first the master nodes and
    then the data nodes in the cluster. After triggering a pod's termination,
    the operator will wait for that pod to be terminated and gone. It will then
    wait for the cluster to have the desired number of nodes again and for the
    cluster to be in a ``GREEN`` state, before terminating the next pod.

    :param core: An instance of the Kubernetes Core V1 API.
    :param namespace: The Kubernetes namespace in which to look up the CrateDB
        cluster.
    :param name: The CrateDB custom resource name defining the CrateDB cluster.
    :param old: The old resource body.
    :param patch: The ``kopf.Patch`` used to record the pods still pending a
        restart in the resource's ``status`` stanza.
    :param status: The current ``status`` stanza of the CrateDB custom
        resource.
    """
    pending_pods: List[Dict[str, str]] = status.get("pendingPods") or []
    if not pending_pods:
        if "master" in old["spec"]["nodes"]:
            pending_pods.extend(await get_pods_in_statefulset(
                core, namespace, name, "master"))
        for node_spec in old["spec"]["nodes"]["data"]:
            pending_pods.extend(await get_pods_in_statefulset(
                core, namespace, name, node_spec["name"]))
        patch.status["pendingPods"] = pending_pods

    if not pending_pods:
        # We're all done
        patch.status["pendingPods"] = None  # Remove attribute from status stanza
        return

    next_pod_uid = pending_pods[0]["uid"]
    next_pod_name = pending_pods[0]["name"]

    all_pod_uids, all_pod_names = await get_pods_in_cluster(
        core, namespace, name)
    if next_pod_uid in all_pod_uids:
        # The next to-be-terminated pod still appears to be running.
        logger.info("Terminating pod '%s'", next_pod_name)
        # Trigger deletion of Pod.
        # This may take a while as it tries to gracefully stop the containers
        # of the Pod.
        await core.delete_namespaced_pod(namespace=namespace,
                                         name=next_pod_name)
        raise kopf.TemporaryError(
            f"Waiting for pod {next_pod_name} ({next_pod_uid}) to be terminated.",
            delay=15,
        )
    elif next_pod_name in all_pod_names:
        total_nodes = get_total_nodes_count(old["spec"]["nodes"])
        # The new pod has been spawned. Only a matter of time until it's ready.
        password, host = await asyncio.gather(
            get_system_user_password(core, namespace, name),
            get_host(core, namespace, name),
        )
        conn_factory = connection_factory(host, password)
        if await is_cluster_healthy(conn_factory, total_nodes, logger):
            pending_pods.pop(0)  # remove the first item in the list

            if pending_pods:
                patch.status["pendingPods"] = pending_pods

                raise kopf.TemporaryError(
                    "Scheduling rerun because there are pods to be restarted",
                    delay=5)
            else:
                # We're all done
                patch.status["pendingPods"] = None  # Remove attribute from `.status`
                return
        else:
            raise kopf.TemporaryError("Cluster is not healthy yet.", delay=30)
    else:
        raise kopf.TemporaryError(
            "Scheduling rerun because there are pods to be restarted",
            delay=15)
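
For clarity, a hedged illustration of the ``pendingPods`` bookkeeping this handler keeps in the resource status; the UIDs and cluster name below are invented:

# Hypothetical example of the ``status.pendingPods`` stanza: a list of pods
# still to be restarted, consumed front to back across handler retries.
pending_pods = [
    {"uid": "4f6c9d1e-0000-0000-0000-000000000000",
     "name": "crate-master-my-cluster-0"},
    {"uid": "91b2aa07-0000-0000-0000-000000000000",
     "name": "crate-data-hot-my-cluster-0"},
]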
async def restart_statefulset(
    core: CoreV1Api,
    connection_factory: Callable[[], Connection],
    namespace: str,
    name: str,
    node_name: str,
    total_nodes: int,
    logger: logging.Logger,
) -> None:
    """
    Perform a rolling restart of the nodes in the Kubernetes StatefulSet
    ``name`` in ``namespace``.

    :param core: An instance of the Kubernetes Core V1 API.
    :param connection_factory: A function that establishes a connection to the
        CrateDB cluster, used to run SQL queries that check its health, etc.
    :param namespace: The Kubernetes namespace in which to look up the CrateDB
        cluster.
    :param name: The CrateDB custom resource name defining the CrateDB cluster.
    :param node_name: Either ``"master"`` for dedicated master nodes, or the
        ``name`` of a data node spec. Used to determine which StatefulSet of
        the cluster should be restarted.
    :param total_nodes: The total number of nodes that the cluster should
        consist of, per the CrateDB cluster spec.
    """
    async with connection_factory() as conn:
        async with conn.cursor() as cursor:
            healthiness = await get_healthiness(cursor)
            if healthiness not in {1, None}:
                raise ValueError("Unhealthy cluster")

    labels = {
        LABEL_COMPONENT: "cratedb",
        LABEL_MANAGED_BY: "crate-operator",
        LABEL_NAME: name,
        LABEL_NODE_NAME: node_name,
        LABEL_PART_OF: "cratedb",
    }

    get_pods = functools.partial(
        core.list_namespaced_pod,
        namespace=namespace,
        label_selector=",".join(f"{k}={v}" for k, v in labels.items()),
    )

    pods = await get_pods()
    for pod in pods.items:
        logger.info("Terminating pod '%s'", pod.metadata.name)
        # Trigger deletion of Pod.
        # This may take a while as it tries to gracefully stop the containers
        # of the Pod.
        await core.delete_namespaced_pod(namespace=namespace,
                                         name=pod.metadata.name)

        # Waiting for the pod to go down. This ensures we won't try to connect
        # to the killed pod through the load balancing service.
        await wait_for_termination(pod, get_pods, logger)

        # Once the Crate node is terminated, we can start checking the health
        # of the cluster.
        await wait_for_healthy_cluster(connection_factory, total_nodes, logger)
        logger.info("Cluster has recovered. Moving on ...")
Example #9
async def start_cluster(
    name: str,
    namespace: V1Namespace,
    cleanup_handler,
    core: CoreV1Api,
    coapi: CustomObjectsApi,
    hot_nodes: int = 0,
    crate_version: str = CRATE_VERSION,
) -> Tuple[str, str]:
    # Clean up persistent volume after the test
    cleanup_handler.append(
        core.delete_persistent_volume(
            name=f"temp-pv-{namespace.metadata.name}-{name}"))
    body = {
        "apiVersion": "cloud.crate.io/v1",
        "kind": "CrateDB",
        "metadata": {
            "name": name
        },
        "spec": {
            "cluster": {
                "imageRegistry": "crate",
                "name": "my-crate-cluster",
                "version": crate_version,
            },
            "nodes": {
                "data": [
                    {
                        "name": "hot",
                        "replicas": hot_nodes,
                        "resources": {
                            "cpus": 0.5,
                            "memory": "1Gi",
                            "heapRatio": 0.25,
                            "disk": {
                                "storageClass": "default",
                                "size": "16GiB",
                                "count": 1,
                            },
                        },
                    },
                ]
            },
        },
    }
    await coapi.create_namespaced_custom_object(
        group=API_GROUP,
        version="v1",
        plural=RESOURCE_CRATEDB,
        namespace=namespace.metadata.name,
        body=body,
    )

    host = await asyncio.wait_for(
        get_public_host(core, namespace.metadata.name, name),
        # It takes a while to retrieve an external IP on AKS.
        timeout=DEFAULT_TIMEOUT * 5,
    )
    password = await get_system_user_password(core, namespace.metadata.name,
                                              name)

    await assert_wait_for(
        True,
        is_cluster_healthy,
        connection_factory(host, password),
        hot_nodes,
        err_msg="Cluster wasn't healthy after 5 minutes.",
        timeout=DEFAULT_TIMEOUT * 5,
    )

    return host, password
Example #10
async def scale_cluster(
    apps: AppsV1Api,
    namespace: str,
    name: str,
    do_scale_data: bool,
    do_scale_master: bool,
    old_total_nodes: int,
    spec: kopf.Spec,
    master_diff_item: Optional[kopf.DiffItem],
    data_diff_items: Optional[kopf.Diff],
    logger: logging.Logger,
):
    """
    Scale cluster ``name`` according to the given ``master_diff_item`` and
    ``data_diff_items``.

    :param apps: An instance of the Kubernetes Apps V1 API.
    :param namespace: The Kubernetes namespace for the CrateDB cluster.
    :param name: The CrateDB custom resource name defining the CrateDB cluster.
    :param do_scale_data: ``True``, if data nodes need to be scaled.
    :param do_scale_master: ``True``, if master nodes need to be scaled.
    :param old_total_nodes: The total number of nodes in the CrateDB cluster
        *before* scaling the StatefulSet.
    :param spec: The ``spec`` field from the new CrateDB cluster custom object.
    :param master_diff_item: An optional change indicating how many master
        nodes a cluster should have.
    :param data_diff_items: An optional list of changes made to the individual
        data node specifications.
    """
    core = CoreV1Api()

    host = await get_host(core, namespace, name)
    password = await get_system_user_password(namespace, name)
    conn_factory = connection_factory(host, password)

    total_nodes = old_total_nodes
    if do_scale_master:
        total_nodes = await scale_cluster_master_nodes(
            apps,
            namespace,
            name,
            spec,
            master_diff_item,
            conn_factory,
            total_nodes,
            logger,
        )

    if do_scale_data:
        total_nodes = await scale_cluster_data_nodes(
            apps,
            namespace,
            name,
            spec,
            data_diff_items,
            conn_factory,
            total_nodes,
            logger,
        )

    await scale_cluster_patch_total_nodes(apps, namespace, name, spec, total_nodes)

    # Acknowledge all node checks that state that the expected number of nodes
    # doesn't line up. The StatefulSets have been adjusted, and once a pod
    # restarts, the node will know about it.
    async with conn_factory() as conn:
        async with conn.cursor() as cursor:
            await cursor.execute(
                """UPDATE sys.node_checks SET acknowledged = TRUE WHERE id = 1"""
            )
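
``get_total_nodes_count`` is referenced in several examples; a minimal sketch based on the ``spec.nodes`` structure used throughout (optional dedicated master replicas plus all data node replicas):

def get_total_nodes_count(nodes: dict) -> int:
    # Sketch derived from the spec layout above: dedicated master replicas (if
    # any) plus the replicas of every data node specification.
    total = 0
    if "master" in nodes:
        total += nodes["master"]["replicas"]
    for node in nodes.get("data", []):
        total += node["replicas"]
    return total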
async def test_upgrade_cluster(
    faker, namespace, cleanup_handler, kopf_runner, api_client
):
    version_from = "4.4.1"
    version_to = "4.4.2"
    coapi = CustomObjectsApi(api_client)
    core = CoreV1Api(api_client)
    name = faker.domain_word()

    host, password = await start_cluster(
        name, namespace, cleanup_handler, core, coapi, 3, version_from
    )

    await assert_wait_for(
        True,
        do_pods_exist,
        core,
        namespace.metadata.name,
        {
            f"crate-data-hot-{name}-0",
            f"crate-data-hot-{name}-1",
            f"crate-data-hot-{name}-2",
        },
    )

    conn_factory = connection_factory(host, password)

    await assert_wait_for(
        True,
        is_cluster_healthy,
        conn_factory,
        3,
        err_msg="Cluster wasn't healthy",
        timeout=DEFAULT_TIMEOUT,
    )

    await create_test_sys_jobs_table(conn_factory)

    pods = await core.list_namespaced_pod(namespace=namespace.metadata.name)
    original_pods = {p.metadata.uid for p in pods.items}

    await coapi.patch_namespaced_custom_object(
        group=API_GROUP,
        version="v1",
        plural=RESOURCE_CRATEDB,
        namespace=namespace.metadata.name,
        name=name,
        body=[
            {
                "op": "replace",
                "path": "/spec/cluster/version",
                "value": version_to,
            },
        ],
    )

    await assert_wait_for(
        False,
        do_pod_ids_exist,
        core,
        namespace.metadata.name,
        original_pods,
        timeout=DEFAULT_TIMEOUT * 15,
    )

    await assert_wait_for(
        True,
        is_kopf_handler_finished,
        coapi,
        name,
        namespace.metadata.name,
        "operator.cloud.crate.io/cluster_update.upgrade",
        err_msg="Upgrade has not finished",
        timeout=DEFAULT_TIMEOUT * 5,
    )

    await assert_wait_for(
        True,
        is_kopf_handler_finished,
        coapi,
        name,
        namespace.metadata.name,
        "operator.cloud.crate.io/cluster_update.restart",
        err_msg="Restart has not finished",
        timeout=DEFAULT_TIMEOUT,
    )

    await assert_wait_for(
        True,
        is_cluster_healthy,
        connection_factory(host, password),
        3,
        err_msg="Cluster wasn't healthy",
        timeout=DEFAULT_TIMEOUT,
    )
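
``do_pod_ids_exist`` mirrors ``do_pods_exist`` but works on pod UIDs; a hypothetical sketch consistent with its use above, where the upgrade is considered rolled out once none of the original UIDs remain:

async def do_pod_ids_exist(core: CoreV1Api, namespace: str, pod_uids: set) -> bool:
    # Hypothetical sketch: True while at least one of the given pod UIDs still
    # exists in the namespace.
    pods = await core.list_namespaced_pod(namespace=namespace)
    return bool(pod_uids.intersection({p.metadata.uid for p in pods.items}))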