def _monitor_k8s_job(
    status: kopf.Status,
    logger: kopf.Logger,
    **_: Any,
) -> bool:
    """Tell whether the job described by ``status`` still needs monitoring.

    Returns ``True`` only when ``status["create_job"]`` starts with ``"Job"``
    and ``status["orbitJobOperator"]["jobStatus"]`` is neither ``"Complete"``
    nor ``"Failed"``; otherwise returns ``False``.

    :param status: Status stanza of the resource.
    :param logger: Unused; accepted to match the handler-filter call shape.
    """
    # Nothing to monitor when the "create_job" entry is absent or does not
    # start with "Job".
    if not (status.get("create_job", "")).startswith("Job"):
        return False

    job_status = status.get("orbitJobOperator", {}).get("jobStatus", None)
    # Terminal states need no further monitoring.
    return job_status not in ["Complete", "Failed"]
def codebuild_monitor(status: kopf.Status, patch: kopf.Patch, logger: kopf.Logger, **_: Any) -> str:
    """Poll the CodeBuild build recorded in the status and mirror its state.

    The build id is read from ``status["replication"]["codeBuildId"]`` and
    looked up with ``batch_get_builds``. The build's ``buildStatus`` and
    ``currentPhase`` are copied into the replication record. Once the build is
    no longer ``IN_PROGRESS`` the worker slot is released, the attempt is
    appended to ``codeBuildAttempts`` and ``replicationStatus`` becomes
    ``"Complete"`` (build ``SUCCEEDED``) or ``"Failed"`` (anything else, with
    ``failureDelay`` set to 30).

    :param status: Status stanza of the resource.
    :param patch: Patch object; ``patch["status"]`` receives the updated
        replication record.
    :param logger: Logger used to report the finished build.
    :returns: The build status reported by CodeBuild.
    """
    global WORKERS_IN_PROCESS

    replication = status.get("replication", {})
    build_id = replication.get("codeBuildId", None)

    client = boto3.client("codebuild")
    build = client.batch_get_builds(ids=[build_id])["builds"][0]
    replication["codeBuildStatus"] = build["buildStatus"]
    replication["codeBuildPhase"] = build["currentPhase"]

    # Compare for equality. The previous `not in "IN_PROGRESS"` was a
    # *substring* test, so an empty status or any fragment of the literal
    # (e.g. "IN") was mistaken for a build that is still running.
    if replication["codeBuildStatus"] != "IN_PROGRESS":
        logger.info("CodeBuildId: %s BuildStatus: %s", build_id, replication["codeBuildStatus"])
        # The build is finished: give the worker slot back.
        with LOCK:
            WORKERS_IN_PROCESS -= 1

        codebuild_attempts = replication.get("codeBuildAttempts", [])
        codebuild_attempts.append(
            {
                "codeBuildId": build_id,
                "codeBuildStatus": build["buildStatus"],
                "codeBuildPhase": build["currentPhase"],
            }
        )
        replication["codeBuildAttempts"] = codebuild_attempts
        replication["replicationStatus"] = "Complete" if build["buildStatus"] == "SUCCEEDED" else "Failed"

    if replication["replicationStatus"] == "Failed":
        replication["failureDelay"] = 30

    patch["status"] = {"replication": replication}
    return cast(str, replication["codeBuildStatus"])
def codebuild_runner(
    spec: kopf.Spec,
    patch: kopf.Patch,
    status: kopf.Status,
    logger: kopf.Logger,
    **_: Any,
) -> str:
    """Start replicating ``spec["source"]`` to ``spec["destination"]``.

    Calls ``imagereplication_utils.replicate_image`` and stores the returned
    build id in the replication record. If an error is returned the record is
    marked ``"Failed"`` with a ``failureDelay`` of 30 and the worker slot is
    released; otherwise the record is marked ``"Replicating"``.

    :param spec: Resource spec providing ``source`` and ``destination``.
    :param patch: Patch object; ``patch["status"]`` receives the record.
    :param status: Status stanza of the resource.
    :param logger: Logger used to report the outcome.
    :returns: The resulting ``replicationStatus``.
    """
    global WORKERS_IN_PROCESS

    record = status.get("replication", {})
    build_id, error = imagereplication_utils.replicate_image(
        src=spec["source"],
        dest=spec["destination"],
        config=CONFIG,
    )

    record["replicationStatus"] = "Replicating"
    record["codeBuildId"] = build_id

    if error:
        record["replicationStatus"] = "Failed"
        record["failureDelay"] = 30
        # No build is running for this resource, so free the worker slot.
        with LOCK:
            WORKERS_IN_PROCESS -= 1

    patch["status"] = {"replication": record}

    report = logger.error if error else logger.info
    report("CodeBuildId: %s Error: %s", build_id, error)

    return cast(str, record["replicationStatus"])
def scheduler(status: kopf.Status, patch: kopf.Patch, logger: kopf.Logger, **_: Any) -> str:
    """Claim a worker slot for the next replication attempt, if one is free.

    The CodeBuild fields of the replication record are cleared and the attempt
    counter is advanced. When the new attempt number exceeds
    ``CONFIG["max_replication_attempts"]`` the record is marked
    ``"MaxAttemptsExceeded"``. Otherwise, if fewer than ``CONFIG["workers"]``
    workers are busy, a slot is taken and the record is marked ``"Scheduled"``.
    When no slot is free nothing is patched.

    :param status: Status stanza of the resource.
    :param patch: Patch object; ``patch["status"]`` receives the record.
    :param logger: Logger used for progress messages.
    :returns: The record's ``replicationStatus`` after this call.
    """
    global WORKERS_IN_PROCESS

    record = status.get("replication", {})
    for stale_key in ("codeBuildStatus", "codeBuildPhase", "codeBuildId"):
        record[stale_key] = None

    next_attempt = record.get("attempt", 0) + 1

    if next_attempt > CONFIG["max_replication_attempts"]:
        record["replicationStatus"] = "MaxAttemptsExceeded"
        record["attempt"] = next_attempt
        patch["status"] = {"replication": record}
        return cast(str, record["replicationStatus"])

    with LOCK:
        logger.debug("WORKERS_IN_PROCESS: %s", WORKERS_IN_PROCESS)
        slot_available = WORKERS_IN_PROCESS < CONFIG["workers"]
        if slot_available:
            WORKERS_IN_PROCESS += 1
            record["replicationStatus"] = "Scheduled"
            record["attempt"] = next_attempt
            patch["status"] = {"replication": record}
            logger.info("Schedule Attempt: %s", record["attempt"])

    return cast(str, record["replicationStatus"])
def _needs_rescheduling(status: kopf.Status, **_: Any) -> bool:
    """Tell whether a failed replication should be handed to the rescheduler.

    Returns ``True`` when the replication record exists, its
    ``replicationStatus`` is ``"Failed"`` and its ``attempt`` count does not
    exceed ``CONFIG["max_replication_attempts"]``.

    :param status: Status stanza of the resource.
    """
    record = status.get("replication", None)
    if not record:
        return False

    has_failed = record.get("replicationStatus", None) == "Failed"
    attempts_used = record.get("attempt", 0)
    return cast(
        bool,
        has_failed and attempts_used <= CONFIG["max_replication_attempts"],
    )
def imagereplications_idx(namespace: str, name: str, spec: kopf.Spec, status: kopf.Status,
                          **_: Any) -> Dict[str, Any]:
    """Build the index entry for an image replication resource.

    The entry is keyed by ``spec["destination"]`` and carries the resource's
    namespace, name, source image and current ``replicationStatus`` (``None``
    when the status has no replication record yet).

    :param namespace: Namespace of the resource.
    :param name: Name of the resource.
    :param spec: Resource spec providing ``source`` and ``destination``.
    :param status: Status stanza of the resource.
    :returns: A single-item mapping ``{destination: details}``.
    """
    record = status.get("replication", {})
    current_state = record.get("replicationStatus", None)

    destination = spec["destination"]
    details = {
        "namespace": namespace,
        "name": name,
        "source": spec["source"],
        "replicationStatus": current_state,
    }
    return {destination: details}
async def handle(  # type: ignore
    self,
    namespace: str,
    name: str,
    body: kopf.Body,
    old: kopf.Body,
    logger: logging.Logger,
    status: kopf.Status,
    **kwargs: Any,
):
    """Re-enable the CronJob that an earlier handler recorded in ``status``.

    The first status key ending in ``DISABLE_CRONJOB_HANDLER_ID`` is looked
    up. If there is none (or its value is ``None``), or if its
    ``IGNORE_CRONJOB`` flag is set, nothing is changed. Otherwise every
    CronJob in ``namespace`` whose name equals the recorded ``CRONJOB_NAME``
    is patched with ``spec.suspend = False``.

    :param namespace: Namespace in which the CronJobs are listed and patched.
    :param logger: Logger used to report what was (not) done.
    :param status: Status stanza holding the disabling handler's result.
    """
    # Value stored under the first matching key; None if no key matches.
    disabler_job_status = next(
        (
            status.get(key)
            for key in status.keys()
            if key.endswith(DISABLE_CRONJOB_HANDLER_ID)
        ),
        None,
    )

    if disabler_job_status is None:
        logger.info("No cronjob was disabled, so can't re-enable anything.")
        return

    if disabler_job_status.get(IGNORE_CRONJOB, False):
        logger.warning("Will not attempt to re-enable any CronJobs")
        return

    async with ApiClient() as api_client:
        job_name = disabler_job_status[CRONJOB_NAME]
        batch = BatchV1beta1Api(api_client)

        jobs: V1beta1CronJobList = await batch.list_namespaced_cron_job(namespace)

        for job in jobs.items:
            if job.metadata.name != job_name:
                continue
            update = {"spec": {"suspend": False}}
            await batch.patch_namespaced_cron_job(job_name, namespace, update)
            logger.info(f"Re-enabled cronjob {job_name}")
def rescheduler(status: kopf.Status, patch: kopf.Patch, logger: kopf.Logger, **_: Any) -> str:
    """Count down the failure delay, then put the replication back to Pending.

    While ``failureDelay`` is positive it is reduced by 5 on every call. Once
    it is zero, negative, missing or ``None`` the record's
    ``replicationStatus`` is reset to ``"Pending"`` and ``failureDelay`` is
    cleared.

    :param status: Status stanza of the resource.
    :param patch: Patch object; ``patch["status"]`` receives the record.
    :param logger: Logger used for a debug trace.
    :returns: Always ``"Rescheduled"``.
    """
    logger.debug("Rescheduling")
    replication = status.get("replication", {})

    # This function itself stores `failureDelay = None` below, so a present
    # but null value must be treated like "no delay left" rather than being
    # compared with an int (which raises TypeError).
    failure_delay = replication.get("failureDelay") or 0

    if failure_delay > 0:
        replication["failureDelay"] = failure_delay - 5
    else:
        replication["replicationStatus"] = "Pending"
        replication["failureDelay"] = None

    patch["status"] = {"replication": replication}
    return "Rescheduled"
def replication_checker(
    spec: kopf.Spec,
    status: kopf.Status,
    patch: kopf.Patch,
    logger: kopf.Logger,
    **_: Any,
) -> str:
    """Initialise the replication record for a resource that has none yet.

    If the status already holds a replication record, its
    ``replicationStatus`` (or ``"Unknown"`` when absent) is returned and
    nothing is patched. Otherwise
    ``imagereplication_utils.image_replicated`` decides whether the record
    starts as ``"ECRImageExists"`` or ``"Pending"``.

    :param spec: Resource spec providing ``destination``.
    :param status: Status stanza of the resource.
    :param patch: Patch object; ``patch["status"]`` receives the new record.
    :param logger: Logger used for progress messages.
    :returns: The existing or newly assigned ``replicationStatus``.
    """
    existing = status.get("replication", None)
    if existing is not None:
        return cast(str, status["replication"].get("replicationStatus", "Unknown"))

    already_there = imagereplication_utils.image_replicated(image=spec["destination"], logger=logger)
    if already_there:
        logger.info("Skipped: Image previously replicated to ECR")
        initial_state = "ECRImageExists"
    else:
        logger.info("Starting Replication")
        initial_state = "Pending"

    patch["status"] = {"replication": {"replicationStatus": initial_state}}
    return initial_state
async def restart_cluster(
    core: CoreV1Api,
    namespace: str,
    name: str,
    old: kopf.Body,
    logger: logging.Logger,
    patch: kopf.Patch,
    status: kopf.Status,
) -> None:
    """
    Perform a rolling restart of the CrateDB cluster ``name`` in ``namespace``.

    Pods are restarted one at a time: master nodes first, then data nodes.
    The list of pods still to be restarted is kept in the ``pendingPods``
    status attribute, so every invocation deals with the first pod of that
    list only and raises :class:`kopf.TemporaryError` to be called again.
    After a pod's deletion has been triggered, the operator waits for it to be
    gone, then for a pod with the same name to show up again and for the
    cluster to be healthy, before moving on to the next pod.

    :param core: An instance of the Kubernetes Core V1 API.
    :param namespace: The Kubernetes namespace where to look up CrateDB cluster.
    :param name: The CrateDB custom resource name defining the CrateDB cluster.
    :param old: The old resource body.
    :param logger: Logger used for progress messages and the health check.
    :param patch: Patch object through which ``pendingPods`` is updated.
    :param status: Status stanza from which ``pendingPods`` is read.
    :raises kopf.TemporaryError: Whenever the restart is not finished yet.
    """
    remaining: List[Dict[str, str]] = status.get("pendingPods") or []

    if not remaining:
        # First invocation: collect every pod, masters ahead of data nodes.
        if "master" in old["spec"]["nodes"]:
            remaining.extend(
                await get_pods_in_statefulset(core, namespace, name, "master")
            )
        for data_spec in old["spec"]["nodes"]["data"]:
            remaining.extend(
                await get_pods_in_statefulset(core, namespace, name, data_spec["name"])
            )
        patch.status["pendingPods"] = remaining

    if not remaining:
        # We're all done
        patch.status["pendingPods"] = None  # Remove attribute from status stanza
        return

    next_pod_uid = remaining[0]["uid"]
    next_pod_name = remaining[0]["name"]

    live_uids, live_names = await get_pods_in_cluster(core, namespace, name)

    if next_pod_uid in live_uids:
        # The next to-be-terminated pod still appears to be running.
        logger.info("Terminating pod '%s'", next_pod_name)
        # Trigger deletion of Pod.
        # This may take a while as it tries to gracefully stop the containers
        # of the Pod.
        await core.delete_namespaced_pod(namespace=namespace, name=next_pod_name)
        raise kopf.TemporaryError(
            f"Waiting for pod {next_pod_name} ({next_pod_uid}) to be terminated.",
            delay=15,
        )

    if next_pod_name not in live_names:
        # Old pod is gone, its replacement has not appeared yet.
        raise kopf.TemporaryError(
            "Scheduling rerun because there are pods to be restarted", delay=15
        )

    # The new pod has been spawned. Only a matter of time until it's ready.
    expected_nodes = get_total_nodes_count(old["spec"]["nodes"])
    password, host = await asyncio.gather(
        get_system_user_password(core, namespace, name),
        get_host(core, namespace, name),
    )
    conn_factory = connection_factory(host, password)

    if not await is_cluster_healthy(conn_factory, expected_nodes, logger):
        raise kopf.TemporaryError("Cluster is not healthy yet.", delay=30)

    remaining.pop(0)  # remove the first item in the list
    if remaining:
        patch.status["pendingPods"] = remaining
        raise kopf.TemporaryError(
            "Scheduling rerun because there are pods to be restarted", delay=5
        )

    # We're all done
    patch.status["pendingPods"] = None  # Remove attribute from `.status`
    return
def orbit_cron_job_monitor(
    namespace: str,
    name: str,
    patch: kopf.Patch,
    status: kopf.Status,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    cron_jobs_idx: kopf.Index[Tuple[str, str], Dict[str, Any]],
    **_: Any,
) -> Any:
    """Track the CronJob backing this resource and record its active job ids.

    The namespace and the CronJob are looked up in the supplied indexes. When
    the CronJob has no ``status`` yet the resource is reported as
    ``"Activating"``; otherwise as ``"Active"``, and the names of all entries
    in ``status.active`` that are not yet listed in
    ``status["orbitJobOperator"]["cronJobIds"]`` are added to that list.

    :param namespace: Namespace of the resource; key into ``namespaces_idx``.
    :param name: Name of the resource; with ``namespace`` the key into
        ``cron_jobs_idx``.
    :param patch: Patch object; ``patch["status"]`` receives the
        ``orbitJobOperator`` record when something has to be written.
    :param status: Status stanza of the resource (read only).
    :param logger: Logger used for debug traces.
    :param namespaces_idx: Index of namespace details keyed by namespace name.
    :param cron_jobs_idx: Index of CronJob details keyed by (namespace, name).
    :returns: ``"JobDetailsNotFound"``, ``"JobMetadataNotFound"``,
        ``"Activating"`` or ``"Active"``.
    """
    ns: Optional[Dict[str, Any]] = None
    k8s_job: Optional[Dict[str, Any]] = None

    # An index lookup may yield several entries; the last one is used.
    for ns in namespaces_idx.get(namespace, []):
        logger.debug("ns: %s", ns)

    if ns is None:
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": "JobDetailsNotFound",
                "error": "No Namespace resource found",
            }
        }
        return "JobDetailsNotFound"

    logger.debug("cron_jobs_idx: %s", cron_jobs_idx)
    for k8s_job in cron_jobs_idx.get((namespace, name), []):
        logger.debug("k8s_job: %s", k8s_job)

    if k8s_job is None:
        # To tackle the race condition caused by Timer
        return "JobMetadataNotFound"

    # Work on a copy: the ids are accumulated in ONE list for the whole call
    # (previously a fresh `[]` was fetched per iteration when the status had
    # no ids yet, so only the last new id survived) and the incoming status
    # is no longer mutated in place.
    cron_job_ids: List[str] = list(status.get("orbitJobOperator", {}).get("cronJobIds", []))

    job_state = k8s_job.get("status")
    if not job_state:
        cron_job_status = "Activating"
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": cron_job_status,
                "jobName": k8s_job.get("name"),
                "cronJobIds": cron_job_ids,
            }
        }
        return cron_job_status

    cron_job_status = "Active"
    found_new_id = False
    # NOTE(review): with no "active" key the `[{}]` default yields one nameless
    # entry, so `None` is recorded once in cronJobIds. That is what triggers
    # the first "Active" patch in that situation, hence it is kept as it was.
    # Confirm whether this is intended.
    for active_ref in job_state.get("active", [{}]):
        active_name = active_ref.get("name")
        if active_name not in cron_job_ids:
            cron_job_ids.append(active_name)
            found_new_id = True

    # Only write when there is something new to record. An already-known id
    # no longer stops the remaining active jobs from being looked at.
    if found_new_id:
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": cron_job_status,
                "jobName": k8s_job.get("name"),
                "cronJobIds": cron_job_ids,
            }
        }
    return cron_job_status
async def cluster_update(
    namespace: str,
    name: str,
    patch: kopf.Patch,
    status: kopf.Status,
    diff: kopf.Diff,
    **kwargs,
):
    """
    Handle cluster updates.

    The update is carried out by a chain of sub-handlers, each of which
    depends on the ones registered before it. Every sub-handler keeps its
    state in the status field of the CrateDB custom resource.

    Because the status field survives between runs of this handler (even
    unrelated ones), a hash of the diff being handled is calculated and stored
    alongside. Sub-handlers use that hash to tell which run they belong to.

    For example, given this status:

    ::

        status:
          cluster_update:
            ref: 24b527bf0eada363bf548f19b98dd9cb
          cluster_update/ensure_enabled_cronjob:
            ref: 24b527bf0eada363bf548f19b98dd9cb
            success: true
          cluster_update/ensure_no_backups:
            ref: 24b527bf0eada363bf548f19b98dd9cb
            success: true
          cluster_update/scale:
            ref: 24b527bf0eada363bf548f19b98dd9cb
            success: true

    ``status.cluster_update.ref`` is the hash of the last diff that was acted
    upon. Since kopf *does not clean up statuses*, a new run compares the
    stored hash with its own; on a mismatch, refs that do not belong to the
    current run can be disregarded.
    """
    context = status.get(CLUSTER_UPDATE_ID) or {}
    diff_hash = hashlib.md5(str(diff).encode("utf-8")).hexdigest()
    # Covers both a brand-new context and one left over from another run.
    if context.get("ref", "") != diff_hash:
        context["ref"] = diff_hash

    upgrade_paths = {
        ("spec", "cluster", "imageRegistry"),
        ("spec", "cluster", "version"),
    }
    scale_paths = (
        ("spec", "nodes", "master", "replicas"),
        ("spec", "nodes", "data"),
    )

    do_upgrade = do_restart = do_scale = False
    for _, field_path, *_ in diff:
        if field_path in upgrade_paths:
            # An upgrade always implies a restart.
            do_upgrade = do_restart = True
        elif field_path in scale_paths:
            do_scale = True

    # Ids of the sub-handlers registered so far; each new sub-handler gets a
    # snapshot of this list as its dependencies.
    prerequisites = [f"{CLUSTER_UPDATE_ID}/ensure_no_backups"]

    kopf.register(
        fn=EnsureNoBackupsSubHandler(namespace, name, diff_hash, context)(),
        id="ensure_no_backups",
        timeout=config.SCALING_TIMEOUT,
    )

    if do_upgrade:
        upgrade_handler = UpgradeSubHandler(
            namespace, name, diff_hash, context, depends_on=prerequisites.copy()
        )
        kopf.register(fn=upgrade_handler(), id="upgrade")
        prerequisites.append(f"{CLUSTER_UPDATE_ID}/upgrade")

    if do_restart:
        restart_handler = RestartSubHandler(
            namespace, name, diff_hash, context, depends_on=prerequisites.copy()
        )
        kopf.register(
            fn=restart_handler(),
            id="restart",
            timeout=config.ROLLING_RESTART_TIMEOUT,
        )
        prerequisites.append(f"{CLUSTER_UPDATE_ID}/restart")

    if do_scale:
        scale_handler = ScaleSubHandler(
            namespace, name, diff_hash, context, depends_on=prerequisites.copy()
        )
        kopf.register(
            fn=scale_handler(),
            id="scale",
            timeout=config.SCALING_TIMEOUT,
        )
        prerequisites.append(f"{CLUSTER_UPDATE_ID}/scale")

    # Registered last, and asked to run even when a dependency failed.
    reenable_handler = EnsureCronjobReenabled(
        namespace,
        name,
        diff_hash,
        context,
        depends_on=prerequisites.copy(),
        run_on_dep_failures=True,
    )
    kopf.register(fn=reenable_handler(), id="ensure_enabled_cronjob")

    patch.status[CLUSTER_UPDATE_ID] = context