def _monitor_k8s_job(
    status: kopf.Status,
    logger: kopf.Logger,
    **_: Any,
) -> bool:
    # Keep monitoring only while a Job has been created and has not finished yet.
    if not status.get("create_job", "").startswith("Job"):
        return False
    return status.get("orbitJobOperator", {}).get("jobStatus") not in ["Complete", "Failed"]
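
A quick illustration of the predicate (not from the source): plain dicts stand in for kopf.Status here, since the function only calls .get() on it, and the field values are made up.

import logging

running = {"create_job": "Job orbit-job-123", "orbitJobOperator": {"jobStatus": "Running"}}
finished = {"create_job": "Job orbit-job-123", "orbitJobOperator": {"jobStatus": "Complete"}}

log = logging.getLogger(__name__)
assert _monitor_k8s_job(status=running, logger=log) is True    # keep monitoring
assert _monitor_k8s_job(status=finished, logger=log) is False  # stop monitoring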
Example #2
def codebuild_monitor(status: kopf.Status, patch: kopf.Patch,
                      logger: kopf.Logger, **_: Any) -> str:
    # Mirror the CodeBuild build's status onto the custom resource's status.
    replication = status.get("replication", {})

    build_id = replication.get("codeBuildId", None)

    client = boto3.client("codebuild")
    build = client.batch_get_builds(ids=[build_id])["builds"][0]
    replication["codeBuildStatus"] = build["buildStatus"]
    replication["codeBuildPhase"] = build["currentPhase"]

    if replication["codeBuildStatus"] not in "IN_PROGRESS":
        logger.info("CodeBuildId: %s BuildStatus: %s", build_id,
                    replication["codeBuildStatus"])
        with LOCK:
            global WORKERS_IN_PROCESS
            WORKERS_IN_PROCESS -= 1
        codebuild_attempts = replication.get("codeBuildAttempts", [])
        codebuild_attempts.append({
            "codeBuildId": build_id,
            "codeBuildStatus": build["buildStatus"],
            "codeBuildPhase": build["currentPhase"],
        })
        replication["codeBuildAttempts"] = codebuild_attempts
        replication["replicationStatus"] = "Complete" if build[
            "buildStatus"] == "SUCCEEDED" else "Failed"

    if replication["replicationStatus"] == "Failed":
        replication["failureDelay"] = 30

    patch["status"] = {"replication": replication}
    return cast(str, replication["codeBuildStatus"])
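
Several of these examples reference module-level state (LOCK, WORKERS_IN_PROCESS, CONFIG) that is not shown. A plausible minimal setup, assumed rather than copied from the source:

import threading
from typing import Any, Dict

LOCK = threading.Lock()            # guards the shared worker counter below
WORKERS_IN_PROCESS = 0             # replications currently being worked on
CONFIG: Dict[str, Any] = {         # operator settings; keys inferred from usage, values assumed
    "workers": 2,
    "max_replication_attempts": 3,
}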
Example #3
def codebuild_runner(
    spec: kopf.Spec,
    patch: kopf.Patch,
    status: kopf.Status,
    logger: kopf.Logger,
    **_: Any,
) -> str:
    replication = status.get("replication", {})

    build_id, error = imagereplication_utils.replicate_image(
        src=spec["source"], dest=spec["destination"], config=CONFIG)

    replication["replicationStatus"] = "Replicating"
    replication["codeBuildId"] = build_id

    if error:
        replication["replicationStatus"] = "Failed"
        replication["failureDelay"] = 30
        with LOCK:
            global WORKERS_IN_PROCESS
            WORKERS_IN_PROCESS -= 1
        logger.error("CodeBuildId: %s Error: %s", build_id, error)
    else:
        logger.info("CodeBuildId: %s", build_id)

    patch["status"] = {"replication": replication}
    return cast(str, replication["replicationStatus"])
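
The helper imagereplication_utils.replicate_image is not shown. A hypothetical sketch of its contract, assuming it kicks off an AWS CodeBuild build and returns a (build_id, error) pair; the codebuild_project config key and environment variable names are invented for illustration:

from typing import Any, Dict, Optional, Tuple

import boto3


def replicate_image(src: str, dest: str,
                    config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
    """Start a CodeBuild build that copies ``src`` to ``dest``; return (build_id, error)."""
    try:
        client = boto3.client("codebuild")
        build = client.start_build(
            projectName=config["codebuild_project"],  # assumed config key
            environmentVariablesOverride=[
                {"name": "SRC_IMAGE", "value": src, "type": "PLAINTEXT"},
                {"name": "DEST_IMAGE", "value": dest, "type": "PLAINTEXT"},
            ],
        )
        return build["build"]["id"], None
    except Exception as e:  # surface the failure to the caller instead of raising
        return None, str(e)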
Example #4
def scheduler(status: kopf.Status, patch: kopf.Patch, logger: kopf.Logger,
              **_: Any) -> str:
    replication = status.get("replication", {})
    replication["codeBuildStatus"] = None
    replication["codeBuildPhase"] = None
    replication["codeBuildId"] = None

    attempt = replication.get("attempt", 0) + 1
    if attempt > CONFIG["max_replication_attempts"]:
        replication["replicationStatus"] = "MaxAttemptsExceeded"
        replication["attempt"] = attempt

        patch["status"] = {"replication": replication}
    else:
        # Claim a worker slot only if one is free; otherwise leave the status
        # unchanged so a later pass can schedule this replication.
        with LOCK:
            global WORKERS_IN_PROCESS
            logger.debug("WORKERS_IN_PROCESS: %s", WORKERS_IN_PROCESS)
            if WORKERS_IN_PROCESS < CONFIG["workers"]:
                WORKERS_IN_PROCESS += 1
                replication["replicationStatus"] = "Scheduled"
                replication["attempt"] = attempt

                patch["status"] = {"replication": replication}
                logger.info("Schedule Attempt: %s", replication["attempt"])

    return cast(str, replication["replicationStatus"])
Example #5
def _needs_rescheduling(status: kopf.Status, **_: Any) -> bool:
    replication = status.get("replication", None)
    if replication:
        replication_status = replication.get("replicationStatus", None)
        attempt = replication.get("attempt", 0)
        return cast(
            bool,
            replication_status == "Failed"
            and attempt <= CONFIG["max_replication_attempts"],
        )
    else:
        return False
Example #6
def imagereplications_idx(namespace: str, name: str, spec: kopf.Spec,
                          status: kopf.Status, **_: Any) -> Dict[str, Any]:
    replication_status = status.get("replication",
                                    {}).get("replicationStatus", None)
    return {
        spec["destination"]: {
            "namespace": namespace,
            "name": name,
            "source": spec["source"],
            "replicationStatus": replication_status,
        }
    }
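
For context (assumed wiring, not shown in the source): an index function like this is registered with kopf's @kopf.index decorator, and other handlers receive the resulting kopf.Index by declaring a parameter with the same name. The group/version/plural and the image URI below are placeholders.

import kopf
from typing import Any, Dict

# @kopf.index("orbit.aws", "v1", "imagereplications")   # hypothetical resource selector
# def imagereplications_idx(...): ...                   # the function above

# A handler body consuming the index (its own decorator omitted):
def destination_already_replicated(
    imagereplications_idx: kopf.Index[str, Dict[str, Any]], **_: Any
) -> bool:
    dest = "111111111111.dkr.ecr.us-east-1.amazonaws.com/myrepo:latest"  # placeholder
    return any(entry["replicationStatus"] == "Complete"
               for entry in imagereplications_idx.get(dest, []))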
Example #7
    async def handle(  # type: ignore
        self,
        namespace: str,
        name: str,
        body: kopf.Body,
        old: kopf.Body,
        logger: logging.Logger,
        status: kopf.Status,
        **kwargs: Any,
    ):
        disabler_job_status = None
        for key in status.keys():
            if key.endswith(DISABLE_CRONJOB_HANDLER_ID):
                disabler_job_status = status.get(key)
                break

        if disabler_job_status is None:
            logger.info(
                "No cronjob was disabled, so can't re-enable anything.")
            return

        if disabler_job_status.get(IGNORE_CRONJOB, False):
            logger.warning("Will not attempt to re-enable any CronJobs")
            return

        async with ApiClient() as api_client:
            job_name = disabler_job_status[CRONJOB_NAME]

            batch = BatchV1beta1Api(api_client)

            jobs: V1beta1CronJobList = await batch.list_namespaced_cron_job(
                namespace)

            for job in jobs.items:
                if job.metadata.name == job_name:
                    update = {"spec": {"suspend": False}}
                    await batch.patch_namespaced_cron_job(
                        job_name, namespace, update)
                    logger.info(f"Re-enabled cronjob {job_name}")
Example #8
def rescheduler(status: kopf.Status, patch: kopf.Patch, logger: kopf.Logger,
                **_: Any) -> str:
    logger.debug("Rescheduling")
    replication = status.get("replication", {})
    failure_delay = replication.get("failureDelay", 0)

    if failure_delay > 0:
        replication["failureDelay"] = failure_delay - 5
    else:
        replication["replicationStatus"] = "Pending"
        replication["failureDelay"] = None

    patch["status"] = {"replication": replication}
    return "Rescheduled"
Example #9
def replication_checker(
    spec: kopf.Spec,
    status: kopf.Status,
    patch: kopf.Patch,
    logger: kopf.Logger,
    **_: Any,
) -> str:
    if status.get("replication", None) is not None:
        return cast(str, status["replication"].get("replicationStatus",
                                                   "Unknown"))

    replication = {}
    if imagereplication_utils.image_replicated(image=spec["destination"],
                                               logger=logger):
        logger.info("Skipped: Image previously replicated to ECR")
        replication["replicationStatus"] = "ECRImageExists"
    else:
        logger.info("Starting Replication")
        replication["replicationStatus"] = "Pending"

    patch["status"] = {"replication": replication}
    return replication["replicationStatus"]
Example #10
async def restart_cluster(
    core: CoreV1Api,
    namespace: str,
    name: str,
    old: kopf.Body,
    logger: logging.Logger,
    patch: kopf.Patch,
    status: kopf.Status,
) -> None:
    """
    Perform a rolling restart of the CrateDB cluster ``name`` in ``namespace``.

    One node at a time, this function will terminate first the master nodes and
    then the data nodes in the cluster. After triggering a pod's termination,
    the operator will wait for that pod to be terminated and gone. It will then
    wait for the cluster to have the desired number of nodes again and for the
    cluster to be in a ``GREEN`` state, before terminating the next pod.

    :param core: An instance of the Kubernetes Core V1 API.
    :param namespace: The Kubernetes namespace in which to look up the CrateDB cluster.
    :param name: The CrateDB custom resource name defining the CrateDB cluster.
    :param old: The old resource body.
    :param logger: The logger to use for progress messages.
    :param patch: The ``kopf.Patch`` used to persist the list of still-pending pods.
    :param status: The current ``status`` stanza of the CrateDB custom resource.

    The waiting below is driven by ``kopf.TemporaryError`` retries; a minimal
    illustration of that pattern follows this example.
    """
    pending_pods: List[Dict[str, str]] = status.get("pendingPods") or []
    if not pending_pods:
        if "master" in old["spec"]["nodes"]:
            pending_pods.extend(await get_pods_in_statefulset(
                core, namespace, name, "master"))
        for node_spec in old["spec"]["nodes"]["data"]:
            pending_pods.extend(await get_pods_in_statefulset(
                core, namespace, name, node_spec["name"]))
        patch.status["pendingPods"] = pending_pods

    if not pending_pods:
        # We're all done; remove the attribute from the status stanza.
        patch.status["pendingPods"] = None
        return

    next_pod_uid = pending_pods[0]["uid"]
    next_pod_name = pending_pods[0]["name"]

    all_pod_uids, all_pod_names = await get_pods_in_cluster(
        core, namespace, name)
    if next_pod_uid in all_pod_uids:
        # The next to-be-terminated pod still appears to be running.
        logger.info("Terminating pod '%s'", next_pod_name)
        # Trigger deletion of Pod.
        # This may take a while as it tries to gracefully stop the containers
        # of the Pod.
        await core.delete_namespaced_pod(namespace=namespace,
                                         name=next_pod_name)
        raise kopf.TemporaryError(
            f"Waiting for pod {next_pod_name} ({next_pod_uid}) to be terminated.",
            delay=15,
        )
    elif next_pod_name in all_pod_names:
        total_nodes = get_total_nodes_count(old["spec"]["nodes"])
        # The new pod has been spawned. Only a matter of time until it's ready.
        password, host = await asyncio.gather(
            get_system_user_password(core, namespace, name),
            get_host(core, namespace, name),
        )
        conn_factory = connection_factory(host, password)
        if await is_cluster_healthy(conn_factory, total_nodes, logger):
            pending_pods.pop(0)  # remove the first item in the list

            if pending_pods:
                patch.status["pendingPods"] = pending_pods

                raise kopf.TemporaryError(
                    "Scheduling rerun because there are pods to be restarted",
                    delay=5)
            else:
                # We're all done; remove the attribute from `.status`.
                patch.status["pendingPods"] = None
                return
        else:
            raise kopf.TemporaryError("Cluster is not healthy yet.", delay=30)
    else:
        raise kopf.TemporaryError(
            "Scheduling rerun because there are pods to be restarted",
            delay=15)
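
The waiting above relies on kopf's retry semantics: raising kopf.TemporaryError makes kopf re-invoke the same handler after `delay` seconds, so the operator can poll for pod termination and cluster health without blocking. A minimal, stand-alone illustration (the resource names are placeholders):

import kopf
from typing import Any

@kopf.on.update("example.com", "v1", "widgets")
async def wait_until_ready(status: kopf.Status, **_: Any) -> None:
    if status.get("phase") != "Ready":
        raise kopf.TemporaryError("Widget is not ready yet.", delay=15)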
Example #11
def orbit_cron_job_monitor(
    namespace: str,
    name: str,
    patch: kopf.Patch,
    status: kopf.Status,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    cron_jobs_idx: kopf.Index[Tuple[str, str], Dict[str, Any]],
    **_: Any,
) -> Any:
    ns: Optional[Dict[str, Any]] = None
    k8s_job: Optional[Dict[str, Any]] = None

    # Look up the indexed Namespace entry for this namespace (the loop simply
    # keeps the last indexed entry, if any).
    for ns in namespaces_idx.get(namespace, []):
        logger.debug("ns: %s", ns)

    if ns is None:
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": "JobDetailsNotFound",
                "error": "No Namespace resource found"
            }
        }
        return "JobDetailsNotFound"

    logger.debug("cron_jobs_idx: %s", cron_jobs_idx)
    for k8s_job in cron_jobs_idx.get((namespace, name), []):
        logger.debug("k8s_job: %s", k8s_job)

    if k8s_job is None:  # To tackle the race condition caused by Timer
        return "JobMetadataNotFound"

    if not k8s_job.get("status", {}):
        cron_job_status = "Activating"
    else:
        cron_job_status = "Active"

    if k8s_job.get("status"):
        for i in k8s_job.get("status", {}).get("active", [{}]):
            cron_job_ids: List[str] = status.get("orbitJobOperator", {}).get("cronJobIds", [])
            if i.get("name") not in cron_job_ids:
                cron_job_ids.append(i.get("name"))
                patch["status"] = {
                    "orbitJobOperator": {
                        "jobStatus": cron_job_status,
                        "jobName": k8s_job.get("name"),
                        "cronJobIds": cron_job_ids,
                    }
                }
            else:
                return cron_job_status
    else:
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": cron_job_status,
                "jobName": k8s_job.get("name"),
                "cronJobIds": status.get("orbitJobOperator", {}).get("cronJobIds", []),
            }
        }

    return cron_job_status
Example #12
async def cluster_update(
    namespace: str,
    name: str,
    patch: kopf.Patch,
    status: kopf.Status,
    diff: kopf.Diff,
    **kwargs,
):
    """
    Handle cluster updates.

    This is done as a chain of sub-handlers that depend on the previous ones completing.
    The state of each handler is stored in the status field of the CrateDB
    custom resource. Since the status field persists between runs of this handler
    (even for unrelated runs), we calculate and store a hash of what changed as well.
    This hash is then used by the sub-handlers to work out which run they are part of.

    i.e., consider this status:

    ::

        status:
          cluster_update:
            ref: 24b527bf0eada363bf548f19b98dd9cb
          cluster_update/ensure_enabled_cronjob:
            ref: 24b527bf0eada363bf548f19b98dd9cb
            success: true
          cluster_update/ensure_no_backups:
            ref: 24b527bf0eada363bf548f19b98dd9cb
            success: true
          cluster_update/scale:
            ref: 24b527bf0eada363bf548f19b98dd9cb
            success: true


    Here ``status.cluster_update.ref`` is the hash of the last diff that was acted
    upon. Since kopf *does not clean up statuses*, when a new run starts we check
    whether the hash matches; if it does not, any refs that do not belong to this
    run can be disregarded. (A sketch of how a sub-handler can consume this ``ref``
    follows this example.)
    """
    context = status.get(CLUSTER_UPDATE_ID)
    hash = hashlib.md5(str(diff).encode("utf-8")).hexdigest()
    if not context:
        context = {"ref": hash}
    elif context.get("ref", "") != hash:
        context["ref"] = hash

    do_upgrade = False
    do_restart = False
    do_scale = False
    for _, field_path, *_ in diff:
        if field_path in {
            ("spec", "cluster", "imageRegistry"),
            ("spec", "cluster", "version"),
        }:
            do_upgrade = True
            do_restart = True
        elif field_path == ("spec", "nodes", "master", "replicas"):
            do_scale = True
        elif field_path == ("spec", "nodes", "data"):
            do_scale = True

    depends_on = [f"{CLUSTER_UPDATE_ID}/ensure_no_backups"]
    kopf.register(
        fn=EnsureNoBackupsSubHandler(namespace, name, hash, context)(),
        id="ensure_no_backups",
        timeout=config.SCALING_TIMEOUT,
    )

    if do_upgrade:
        kopf.register(
            fn=UpgradeSubHandler(
                namespace, name, hash, context, depends_on=depends_on.copy()
            )(),
            id="upgrade",
        )
        depends_on.append(f"{CLUSTER_UPDATE_ID}/upgrade")

    if do_restart:
        kopf.register(
            fn=RestartSubHandler(
                namespace, name, hash, context, depends_on=depends_on.copy()
            )(),
            id="restart",
            timeout=config.ROLLING_RESTART_TIMEOUT,
        )
        depends_on.append(f"{CLUSTER_UPDATE_ID}/restart")

    if do_scale:
        kopf.register(
            fn=ScaleSubHandler(
                namespace, name, hash, context, depends_on=depends_on.copy()
            )(),
            id="scale",
            timeout=config.SCALING_TIMEOUT,
        )
        depends_on.append(f"{CLUSTER_UPDATE_ID}/scale")

    kopf.register(
        fn=EnsureCronjobReenabled(
            namespace,
            name,
            hash,
            context,
            depends_on=depends_on.copy(),
            run_on_dep_failures=True,
        )(),
        id="ensure_enabled_cronjob",
    )

    patch.status[CLUSTER_UPDATE_ID] = context
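
A sketch of how a sub-handler might consume the stored ref described in the docstring: skip its work only when its recorded ref and success flag already match the current run's hash. The class internals here are assumed, not the project's actual base class.

import kopf


class StateBasedSubHandler:  # hypothetical base class, for illustration only
    def __init__(self, namespace: str, name: str, hash: str, context: dict, **kwargs):
        self.namespace = namespace
        self.name = name
        self.hash = hash
        self.context = context

    def already_done(self, status: kopf.Status, handler_id: str) -> bool:
        # A sub-handler's own status entry is only trustworthy if it was written
        # for the same diff hash as the current run.
        recorded = status.get(handler_id) or {}
        return recorded.get("ref") == self.hash and recorded.get("success", False)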