async def _ensure_cronjob_suspended(
    namespace: str, name: str, logger: logging.Logger
) -> Optional[Dict]:
    """
    Suspend the backup ``CronJob`` of a cluster, if one exists.

    Only the first ``CronJob`` in ``namespace`` whose labels are
    ``app.kubernetes.io/component=backup`` and
    ``app.kubernetes.io/name=<name>`` is considered.

    :param namespace: The Kubernetes namespace to search for CronJobs.
    :param name: The value the ``app.kubernetes.io/name`` label must have.
    :param logger: Logger used to report what was done.
    :return: ``None`` when no matching CronJob was found. Otherwise a dict
        holding ``CRONJOB_NAME`` and ``CRONJOB_SUSPENDED``; when the CronJob
        was already suspended it is left untouched and the dict additionally
        carries ``IGNORE_CRONJOB``.
    """
    async with ApiClient() as api_client:
        batch = BatchV1beta1Api(api_client)
        jobs: V1beta1CronJobList = await batch.list_namespaced_cron_job(namespace)

        for job in jobs.items:
            job_name = job.metadata.name
            # ``metadata.labels`` is ``None`` for objects that carry no labels
            # at all; such CronJobs must be skipped instead of crashing.
            labels = job.metadata.labels or {}
            if (
                labels.get("app.kubernetes.io/component") != "backup"
                or labels.get("app.kubernetes.io/name") != name
            ):
                continue

            if job.spec.suspend:
                # ``Logger.warn`` is a deprecated alias of ``Logger.warning``.
                logger.warning(
                    f"Found job {job_name} that is already suspended, ignoring"
                )
                return {
                    CRONJOB_NAME: job_name,
                    CRONJOB_SUSPENDED: True,
                    IGNORE_CRONJOB: True,
                }

            logger.info(
                f"Temporarily suspending CronJob {job_name} "
                f"while cluster update in progress"
            )
            update = {"spec": {"suspend": True}}
            await batch.patch_namespaced_cron_job(job_name, namespace, update)
            return {CRONJOB_NAME: job_name, CRONJOB_SUSPENDED: True}

    return None
async def _ensure_no_backup_cronjobs_running(
    self, namespace: str, name: str, logger: logging.Logger
):
    """
    Abort (temporarily) while a backup job of the cluster is still active.

    Lists all jobs in ``namespace`` and looks for one labelled
    ``app.kubernetes.io/component=backup`` and
    ``app.kubernetes.io/name=<name>`` whose status has ``active`` set.

    :param namespace: The Kubernetes namespace to search for jobs.
    :param name: The value the ``app.kubernetes.io/name`` label must have.
    :param logger: Logger passed on to the API helper and the notification.
    :raises kopf.TemporaryError: With a delay of 30 seconds when such a job
        is found, after ``_notify_backup_running`` has been executed.
    """
    async with ApiClient() as api_client:
        batch = BatchV1Api(api_client)
        jobs: V1JobList = await call_kubeapi(
            batch.list_namespaced_job, logger, namespace=namespace
        )

        for job in jobs.items:
            job_name = job.metadata.name
            # ``metadata.labels`` is ``None`` for jobs without any labels;
            # unrelated, unlabelled jobs must not break this check.
            labels = job.metadata.labels or {}
            job_status: V1JobStatus = job.status
            if (
                labels.get("app.kubernetes.io/component") == "backup"
                and labels.get("app.kubernetes.io/name") == name
                and job_status.active is not None
            ):
                # Executed directly (not registered) so that it runs before
                # the TemporaryError below is raised.
                await kopf.execute(
                    fns={
                        "notify_backup_running": subhandler_partial(
                            self._notify_backup_running, logger
                        )
                    }
                )
                raise kopf.TemporaryError(
                    "A snapshot k8s job is currently running, "
                    f"waiting for it to finish: {job_name}",
                    delay=30,
                )
async def k8s_secrets(namespace: str) -> List[str]:
    """Return the names of all secrets found in ``namespace``."""
    async with ApiClient() as kube:
        core_v1 = k8s.client.CoreV1Api(kube)
        secret_list = await core_v1.list_namespaced_secret(namespace)
        return [secret.metadata.name for secret in secret_list.items]
async def main():
    """
    Print the logs of every container in the pods matching the CLI arguments.

    Pods are selected from ``args.namespace`` by name prefix (``args.pod``).
    One ``print_pod_log`` job is started per container; with ``args.follow``
    an additional job keeps the kube-config token refreshed.
    """
    args = parse_args()
    loader = await config.load_kube_config()

    # The context manager closes the HTTP session on every exit path,
    # including the early return below and any raised exception.
    async with ApiClient() as api:
        v1_api = client.CoreV1Api(api)
        ret = await v1_api.list_namespaced_pod(args.namespace)

        cmd = []
        for pod in ret.items:
            if not pod.metadata.name.startswith(args.pod):
                continue
            for container in pod.spec.containers:
                cmd.append(
                    print_pod_log(
                        v1_api,
                        pod.metadata.name,
                        args.namespace,
                        container.name,
                        args.lines,
                        args.follow,
                    )
                )

        if not cmd:
            print('No matching PODs !')
            return

        if args.follow:
            # autorefresh gcp token
            cmd.append(config.refresh_token(loader))

        # ``asyncio.wait`` no longer accepts bare coroutines (deprecated in
        # Python 3.8, removed in 3.11), so wrap each one in a task first.
        await asyncio.wait([asyncio.ensure_future(job) for job in cmd])
async def create_services(
    owner_references: Optional[List[V1OwnerReference]],
    namespace: str,
    name: str,
    labels: LabelType,
    http_port: int,
    postgres_port: int,
    transport_port: int,
    dns_record: Optional[str],
    logger: logging.Logger,
) -> None:
    """
    Create the data service and the discovery service in ``namespace``.

    Both services are created through ``call_kubeapi`` with
    ``continue_on_conflict=True``; the data service is created first.
    """
    async with ApiClient() as kube:
        core_v1 = CoreV1Api(kube)

        data_service = get_data_service(
            owner_references, name, labels, http_port, postgres_port, dns_record
        )
        await call_kubeapi(
            core_v1.create_namespaced_service,
            logger,
            continue_on_conflict=True,
            namespace=namespace,
            body=data_service,
        )

        discovery_service = get_discovery_service(
            owner_references, name, labels, transport_port
        )
        await call_kubeapi(
            core_v1.create_namespaced_service,
            logger,
            continue_on_conflict=True,
            namespace=namespace,
            body=discovery_service,
        )
async def api_client(context=None, api_client_kwargs=None):
    """
    Async generator yielding an ``ApiClient`` and a dict of API wrappers.

    The kube config is loaded from ``KUBECONFIG_FILE``. The yielded
    ``user_context`` maps ``core_api``, ``apps_api``, ``storage_api``,
    ``batch_api``, ``cronjob_batch_api``, ``custom_object_api`` and
    ``extensions_api`` to API objects sharing the yielded client. When
    ``context`` has a truthy ``node`` entry, ``user_context['node']`` is
    filled with the result of ``get_node``.

    The client is closed when the generator is finalised.

    :param context: Optional mapping of feature flags; not modified.
    :param api_client_kwargs: Optional keyword arguments for ``ApiClient``;
        ``request_timeout`` defaults to 50 when absent. Not modified.
    """
    await config.load_kube_config(config_file=KUBECONFIG_FILE)

    # Work on copies so that dicts owned by the caller are never mutated.
    context = dict(context or {})
    context['core_api'] = True
    api_client_kwargs = dict(api_client_kwargs or {})
    api_client_kwargs.setdefault('request_timeout', 50)

    api_cl = ApiClient(**api_client_kwargs)
    try:
        # Everything after the client exists happens inside ``try`` so the
        # client is closed even if building the context fails.
        user_context = {
            'core_api': client.CoreV1Api(api_cl),
            'apps_api': client.AppsV1Api(api_cl),
            'storage_api': client.StorageV1Api(api_cl),
            'batch_api': client.BatchV1Api(api_cl),
            'cronjob_batch_api': client.BatchV1beta1Api(api_cl),
            'custom_object_api': client.CustomObjectsApi(api_cl),
            'extensions_api': client.ApiextensionsV1Api(api_cl),
        }
        # ``node`` is the only context flag that triggers extra work.
        if context.get('node'):
            user_context['node'] = await get_node(user_context['core_api'])
        yield api_cl, user_context
    finally:
        await api_cl.close()
async def k8s_namespaces() -> List[str]:
    """Return the names of all namespaces in the cluster."""
    async with ApiClient() as kube:
        core_v1 = k8s.client.CoreV1Api(kube)
        namespace_list = await core_v1.list_namespace()
        return [ns.metadata.name for ns in namespace_list.items]
async def create_debug_volume(
    owner_references: Optional[List[V1OwnerReference]],
    namespace: str,
    name: str,
    labels: LabelType,
    logger: logging.Logger,
) -> None:
    """
    Create the ``PersistentVolume`` and ``PersistentVolumeClaim`` that CrateDB
    uses for exporting Java heap dumps.

    The volume is configured through the
    :attr:`~crate.operator.config.Config.DEBUG_VOLUME_SIZE` and
    :attr:`~crate.operator.config.Config.DEBUG_VOLUME_STORAGE_CLASS` settings.

    Both objects are created with ``continue_on_conflict=True``; the volume is
    created before the claim.
    """
    async with ApiClient() as kube:
        core_v1 = CoreV1Api(kube)

        volume = get_debug_persistent_volume(
            owner_references, namespace, name, labels
        )
        await call_kubeapi(
            core_v1.create_persistent_volume,
            logger,
            continue_on_conflict=True,
            body=volume,
        )

        claim = get_debug_persistent_volume_claim(owner_references, name, labels)
        await call_kubeapi(
            core_v1.create_namespaced_persistent_volume_claim,
            logger,
            continue_on_conflict=True,
            namespace=namespace,
            body=claim,
        )
async def k8s_get_service(namespace: str, name: str) -> Optional[V1Service]:
    """
    Return the service called ``name`` in ``namespace``, or ``None`` when the
    namespace has no service with that name.
    """
    async with ApiClient() as kube:
        core_v1 = k8s.client.CoreV1Api(kube)
        service_list = await core_v1.list_namespaced_service(namespace)
        for candidate in service_list.items:
            if candidate.metadata.name == name:
                return candidate
        return None
async def main():
    """Print the IP, namespace and name of every pod in the cluster."""
    # Without arguments the kube config is read from its default location;
    # alternatively it can be set on the Configuration class directly.
    await config.load_kube_config()

    # Leaving the ``async with`` block closes the HTTP sessions for us.
    async with ApiClient() as kube:
        core_v1 = client.CoreV1Api(kube)
        print("Listing pods with their IPs:")
        pod_list = await core_v1.list_pod_for_all_namespaces()
        for pod in pod_list.items:
            print(pod.status.pod_ip, pod.metadata.namespace, pod.metadata.name)
async def create_backups(
    owner_references: Optional[List[V1OwnerReference]],
    namespace: str,
    name: str,
    labels: LabelType,
    http_port: int,
    prometheus_port: int,
    backups: Dict[str, Any],
    image_pull_secrets: Optional[List[V1LocalObjectReference]],
    has_ssl: bool,
    logger: logging.Logger,
) -> None:
    """
    Create the backup ``CronJob`` and the backup metrics exporter deployment.

    Nothing is created unless ``backups`` has a truthy ``"aws"`` entry. Both
    objects are created with ``continue_on_conflict=True``; the CronJob is
    created before the deployment.
    """
    aws_settings = backups.get("aws")
    async with ApiClient() as kube:
        apps_v1 = AppsV1Api(kube)
        batch_v1beta1 = BatchV1beta1Api(kube)

        if not aws_settings:
            return

        cronjob = get_backup_cronjob(
            owner_references,
            name,
            labels,
            http_port,
            aws_settings,
            image_pull_secrets,
            has_ssl,
        )
        await call_kubeapi(
            batch_v1beta1.create_namespaced_cron_job,
            logger,
            continue_on_conflict=True,
            namespace=namespace,
            body=cronjob,
        )

        exporter = get_backup_metrics_exporter(
            owner_references,
            name,
            labels,
            http_port,
            prometheus_port,
            aws_settings,
            image_pull_secrets,
            has_ssl,
        )
        await call_kubeapi(
            apps_v1.create_namespaced_deployment,
            logger,
            continue_on_conflict=True,
            namespace=namespace,
            body=exporter,
        )
async def create_sql_exporter_config(
    owner_references: Optional[List[V1OwnerReference]],
    namespace: str,
    name: str,
    labels: LabelType,
    logger: logging.Logger,
) -> None:
    """
    Create the SQL exporter ``ConfigMap`` in ``namespace``.

    The object is created with ``continue_on_conflict=True``.
    """
    async with ApiClient() as kube:
        core_v1 = CoreV1Api(kube)
        config_map = get_sql_exporter_config(owner_references, name, labels)
        await call_kubeapi(
            core_v1.create_namespaced_config_map,
            logger,
            continue_on_conflict=True,
            namespace=namespace,
            body=config_map,
        )
async def handle(  # type: ignore
    self,
    namespace: str,
    name: str,
    old: kopf.Body,
    logger: logging.Logger,
    patch: kopf.Patch,
    status: kopf.Status,
    **kwargs: Any,
):
    """
    Restart the cluster via ``restart_cluster`` and then send the pending
    notifications.
    """
    async with ApiClient() as kube:
        core_v1 = CoreV1Api(kube)
        await restart_cluster(core_v1, namespace, name, old, logger, patch, status)

    await self.send_notifications(logger)
async def secret_update(
    namespace: str,
    name: str,
    diff: kopf.Diff,
    logger: logging.Logger,
    **kwargs,
):
    """
    Register password-update subhandlers for users affected by a secret change.

    For every diff item, all ``CrateDB`` custom resources in ``namespace`` are
    inspected. For each user whose password ``secretKeyRef`` points at secret
    ``name`` and at the changed ``("data", <key>)`` field, an
    ``update_user_password`` subhandler is registered with kopf.

    :param namespace: The namespace of the changed secret.
    :param name: The name of the changed secret.
    :param diff: The kopf diff describing the changed secret fields.
    :param logger: Logger handed to the registered subhandlers.
    """
    async with ApiClient() as api_client:
        coapi = CustomObjectsApi(api_client)
        core = CoreV1Api(api_client)

        for operation, field_path, old_value, new_value in diff:
            custom_objects = await coapi.list_namespaced_custom_object(
                namespace=namespace,
                group=API_GROUP,
                version="v1",
                plural=RESOURCE_CRATEDB,
            )

            for crate_custom_object in custom_objects["items"]:
                cratedb_name = crate_custom_object["metadata"]["name"]

                # ``spec.users`` is optional. Resources without users cannot
                # be affected, so skip them instead of raising ``KeyError``
                # and avoid resolving a host that would never be used.
                users = crate_custom_object["spec"].get("users") or []
                if not users:
                    continue

                host = await get_host(core, namespace, cratedb_name)

                for user_spec in users:
                    secret_ref = user_spec["password"]["secretKeyRef"]
                    expected_field_path = ("data", secret_ref["key"])
                    if (
                        secret_ref["name"] != name
                        or field_path != expected_field_path
                    ):
                        continue

                    kopf.register(
                        fn=subhandler_partial(
                            update_user_password,
                            host,
                            user_spec["name"],
                            old_value,
                            new_value,
                            logger,
                        ),
                        id=f"update-{cratedb_name}-{user_spec['name']}",
                        timeout=config.BOOTSTRAP_TIMEOUT,
                    )
async def api_client(context=None, api_client_kwargs=None):
    """
    Async generator yielding an ``ApiClient`` and a dict of API wrappers.

    The kube config is loaded from ``KUBECONFIG_FILE``. The yielded
    ``user_context`` maps ``core_api``, ``apps_api`` and ``storage_api`` to
    API objects sharing the yielded client. When ``context`` has a truthy
    ``node`` entry, ``user_context['node']`` is filled with the result of
    ``get_node``.

    The client is closed when the generator is finalised.

    :param context: Optional mapping of feature flags; not modified.
    :param api_client_kwargs: Optional keyword arguments for ``ApiClient``.
    """
    await config.load_kube_config(config_file=KUBECONFIG_FILE)

    # Work on a copy so that the caller's dict is never mutated.
    context = dict(context or {})
    context['core_api'] = True

    api_cl = ApiClient(**(api_client_kwargs or {}))
    try:
        # The node lookup talks to the cluster and may fail; doing it inside
        # ``try`` guarantees the client is closed in that case as well.
        user_context = {
            'core_api': client.CoreV1Api(api_cl),
            'apps_api': client.AppsV1Api(api_cl),
            'storage_api': client.StorageV1Api(api_cl),
        }
        # ``node`` is the only context flag that triggers extra work.
        if context.get('node'):
            user_context['node'] = await get_node(user_context['core_api'])
        yield api_cl, user_context
    finally:
        await api_cl.close()
async def create_system_user(
    owner_references: Optional[List[V1OwnerReference]],
    namespace: str,
    name: str,
    labels: LabelType,
    logger: logging.Logger,
) -> None:
    """
    Create the secret holding the credentials of the ``system`` user.

    The *CrateDB Operator* uses that ``system`` user to perform operations on
    the CrateDB cluster. The secret is created with
    ``continue_on_conflict=True``.
    """
    async with ApiClient() as kube:
        core_v1 = CoreV1Api(kube)
        system_user_secret = get_system_user_secret(owner_references, name, labels)
        await call_kubeapi(
            core_v1.create_namespaced_secret,
            logger,
            continue_on_conflict=True,
            namespace=namespace,
            body=system_user_secret,
        )
async def bootstrap_cluster(
    namespace: str,
    name: str,
    master_node_pod: str,
    license: Optional[SecretKeyRefContainer],
    has_ssl: bool,
    users: Optional[List[Dict[str, Any]]],
    logger: logging.Logger,
):
    """
    Bootstrap a whole cluster: license, system user and additional users.

    :param namespace: The Kubernetes namespace of the CrateDB cluster.
    :param name: The name of the ``CrateDB`` custom resource. It is used to
        look up the password of the system user created during deployment.
    :param master_node_pod: The name of a pod running one of the eligible
        master nodes of the cluster; commands are ``exec``-ed in it.
    :param license: An optional ``secretKeyRef`` pointing at the Kubernetes
        secret that contains the CrateDB license key.
    :param has_ssl: If ``True``, ``crash`` connects to the CrateDB cluster
        from inside the ``crate`` container using SSL/TLS. This has to match
        the cluster configuration: with SSL/TLS enabled, unencrypted
        connections are forbidden, and without it encrypted connections are
        impossible, so a mismatch prevents ``crash`` from connecting.
    :param users: An optional list of user definitions, each with the
        username and the secret key reference to the user's password.
    """
    async with ApiClient() as kube:
        core_v1 = CoreV1Api(kube)

        # The license goes first: the cluster may have more nodes than the
        # free license permits.
        if license:
            await bootstrap_license(
                core_v1, namespace, master_node_pod, has_ssl, license, logger
            )

        await bootstrap_system_user(
            core_v1, namespace, name, master_node_pod, has_ssl, logger
        )

        if users:
            await bootstrap_users(core_v1, namespace, name, users)
async def update_cratedb_resource(
    namespace: str,
    name: str,
    spec: kopf.Spec,
    **kwargs,
):
    """
    Make sure every password secret referenced by ``spec["users"]`` carries
    the ``LABEL_USER_PASSWORD`` label.

    Secrets that have no labels at all, or lack that label, are passed to
    ``ensure_user_password_label``. Nothing happens when the spec has no
    ``users`` entry.
    """
    if "users" not in spec:
        return

    async with ApiClient() as kube:
        # A single API wrapper serves all users.
        core_v1 = CoreV1Api(kube)
        for user_spec in spec["users"]:
            secret_name = user_spec["password"]["secretKeyRef"]["name"]
            secret = await core_v1.read_namespaced_secret(
                namespace=namespace, name=secret_name
            )
            secret_labels = secret.metadata.labels
            if secret_labels is None or LABEL_USER_PASSWORD not in secret_labels:
                await ensure_user_password_label(core_v1, namespace, secret_name)
async def handle(  # type: ignore
    self,
    namespace: str,
    name: str,
    body: kopf.Body,
    old: kopf.Body,
    logger: logging.Logger,
    status: kopf.Status,
    **kwargs: Any,
):
    """
    Re-enable the CronJob that an earlier handler suspended.

    The name of that CronJob is read from the first ``status`` entry whose
    key ends with ``DISABLE_CRONJOB_HANDLER_ID``. Nothing is done when no
    such entry exists or when the entry is flagged with ``IGNORE_CRONJOB``.
    """
    # Value of the first status entry written by the disabling handler.
    disabler_job_status = next(
        (
            status.get(key)
            for key in status.keys()
            if key.endswith(DISABLE_CRONJOB_HANDLER_ID)
        ),
        None,
    )

    if disabler_job_status is None:
        logger.info("No cronjob was disabled, so can't re-enable anything.")
        return

    if disabler_job_status.get(IGNORE_CRONJOB, False):
        logger.warning("Will not attempt to re-enable any CronJobs")
        return

    async with ApiClient() as kube:
        job_name = disabler_job_status[CRONJOB_NAME]
        batch_v1beta1 = BatchV1beta1Api(kube)
        cron_jobs: V1beta1CronJobList = (
            await batch_v1beta1.list_namespaced_cron_job(namespace)
        )

        # Only patch the CronJob if it still exists in the namespace.
        for cron_job in cron_jobs.items:
            if cron_job.metadata.name != job_name:
                continue
            resume_patch = {"spec": {"suspend": False}}
            await batch_v1beta1.patch_namespaced_cron_job(
                job_name, namespace, resume_patch
            )
            logger.info(f"Re-enabled cronjob {job_name}")
async def delete_deployment(self, deployment_id: str):
    """
    Delete a deployment and its ``<deployment_id>-svc`` service from
    ``KUBE_NAMESPACE``.

    The service is deleted before the deployment. The cluster is contacted
    at ``self.cluster_endpoint`` on port 443 using a bearer token obtained
    from ``self.auth_client``; SSL verification is turned off.
    """
    assert self.auth_client
    assert self.cluster_endpoint

    token = await self.auth_client.get()
    kube_config = client.Configuration(
        host=f"https://{self.cluster_endpoint}:443",
        api_key={"authorization": f"Bearer {token}"},
    )
    kube_config.verify_ssl = False

    async with ApiClient(configuration=kube_config) as kube:
        apps_v1 = client.AppsV1Api(kube)
        core_v1 = client.CoreV1Api(kube)

        # Service first ...
        await core_v1.delete_namespaced_service(
            name=f"{deployment_id}-svc", namespace=KUBE_NAMESPACE
        )

        # ... then the deployment itself.
        await apps_v1.delete_namespaced_deployment(
            name=deployment_id, namespace=KUBE_NAMESPACE
        )
async def _ensure_no_snapshots_in_progress(self, namespace, name, logger):
    """
    Raise a ``kopf.TemporaryError`` (delay of 30 seconds) while
    ``are_snapshots_in_progress`` reports a running snapshot.

    Before raising, ``_notify_backup_running`` is executed.
    """
    async with ApiClient() as kube:
        core_v1 = CoreV1Api(kube)

        host = await get_host(core_v1, namespace, name)
        password = await get_system_user_password(core_v1, namespace, name)
        conn_factory = connection_factory(host, password)

        in_progress, statement = await are_snapshots_in_progress(
            conn_factory, logger
        )
        if not in_progress:
            return

        # A TemporaryError clears all registered subhandlers, so the
        # notification is executed directly here to be sure it runs.
        # It is still guaranteed to be executed only once.
        notify = subhandler_partial(self._notify_backup_running, logger)
        await kopf.execute(fns={"notify_backup_running": notify})
        raise kopf.TemporaryError(
            "A snapshot is currently in progress, "
            f"waiting for it to finish: {statement}",
            delay=30,
        )
def __init__(self, api_client=None):
    """Store ``api_client``; a new ``ApiClient`` is created when it is ``None``."""
    self.api_client = ApiClient() if api_client is None else api_client
async def create_statefulset(
    owner_references: Optional[List[V1OwnerReference]],
    namespace: str,
    name: str,
    labels: LabelType,
    treat_as_master: bool,
    treat_as_data: bool,
    cluster_name: str,
    node_name: str,
    node_name_prefix: str,
    node_spec: Dict[str, Any],
    master_nodes: List[str],
    total_nodes_count: int,
    http_port: int,
    jmx_port: int,
    postgres_port: int,
    prometheus_port: int,
    transport_port: int,
    crate_image: str,
    ssl: Optional[Dict[str, Any]],
    cluster_settings: Optional[Dict[str, str]],
    image_pull_secrets: Optional[List[V1LocalObjectReference]],
    logger: logging.Logger,
) -> None:
    """
    Create the ``StatefulSet`` for one node spec together with its
    ``PodDisruptionBudget``.

    Both objects are created with ``continue_on_conflict=True``; the
    StatefulSet is created first. The budget is named ``crate-<name>``,
    allows at most one unavailable pod and selects pods by the
    ``LABEL_COMPONENT``, ``LABEL_NAME`` and ``LABEL_NODE_NAME`` labels.
    """
    async with ApiClient() as kube:
        apps_v1 = AppsV1Api(kube)
        statefulset = get_statefulset(
            owner_references,
            namespace,
            name,
            labels,
            treat_as_master,
            treat_as_data,
            cluster_name,
            node_name,
            node_name_prefix,
            node_spec,
            master_nodes,
            total_nodes_count,
            http_port,
            jmx_port,
            postgres_port,
            prometheus_port,
            transport_port,
            crate_image,
            ssl,
            cluster_settings,
            image_pull_secrets,
            logger,
        )
        await call_kubeapi(
            apps_v1.create_namespaced_stateful_set,
            logger,
            continue_on_conflict=True,
            namespace=namespace,
            body=statefulset,
        )

        # A Pod Disruption Budget ensures that Kubernetes cluster maintenance
        # (i.e. upgrades) never disrupts more than 1 pod of a StatefulSet at
        # a time.
        policy_v1beta1 = PolicyV1beta1Api(kube)
        pod_selector = V1LabelSelector(
            match_labels={
                LABEL_COMPONENT: "cratedb",
                LABEL_NAME: name,
                LABEL_NODE_NAME: node_name,
            }
        )
        disruption_budget = V1beta1PodDisruptionBudget(
            metadata=V1ObjectMeta(
                name=f"crate-{name}",
                owner_references=owner_references,
            ),
            spec=V1beta1PodDisruptionBudgetSpec(
                max_unavailable=1,
                selector=pod_selector,
            ),
        )
        await call_kubeapi(
            policy_v1beta1.create_namespaced_pod_disruption_budget,
            logger,
            continue_on_conflict=True,
            namespace=namespace,
            body=disruption_budget,
        )
async def handle(  # type: ignore
    self,
    namespace: str,
    name: str,
    spec: kopf.Spec,
    old: kopf.Body,
    diff: kopf.Diff,
    logger: logging.Logger,
    **kwargs: Any,
):
    """
    Scale the cluster according to the replica changes found in ``diff``.

    Only changes to ``spec.nodes.master.replicas`` and to the ``replicas`` of
    the entries in ``spec.nodes.data`` are acted upon; every other change is
    logged and ignored. After ``scale_cluster`` has finished, a
    ``WebhookEvent.SCALE`` notification with the old and new replica counts
    is scheduled and the notifications are sent.

    :raises kopf.PermanentError: When the number of data node specs changed.
    """
    master_change: Optional[kopf.DiffItem] = None
    data_changes: Optional[List[kopf.DiffItem]] = None

    for operation, field_path, old_value, new_value in diff:
        if field_path == ("spec", "nodes", "master", "replicas"):
            master_change = kopf.DiffItem(
                operation, field_path, old_value, new_value
            )
            continue

        if field_path != ("spec", "nodes", "data"):
            logger.info("Ignoring operation %s on field %s", operation, field_path)
            continue

        # TODO: check for data node order, added or removed types, ...
        if len(old_value) != len(new_value):
            raise kopf.PermanentError(
                "Adding and removing node specs is not supported at this time."
            )

        data_changes = []
        # Both lists have the same length, so the specs can be paired up.
        for position, (old_spec, new_spec) in enumerate(zip(old_value, new_value)):
            index_path = (str(position),)
            for inner_op, inner_path, inner_old, inner_new in calc_diff(
                old_spec, new_spec
            ):
                if inner_path != ("replicas",):
                    logger.info(
                        "Ignoring operation %s on field %s",
                        operation,
                        field_path + index_path + inner_path,
                    )
                    continue
                data_changes.append(
                    kopf.DiffItem(
                        inner_op, index_path + inner_path, inner_old, inner_new
                    )
                )

    # An empty or missing list of data changes is passed on as ``None``.
    data_diff = kopf.Diff(data_changes) if data_changes else None

    async with ApiClient() as kube:
        apps_v1 = AppsV1Api(kube)
        core_v1 = CoreV1Api(kube)
        await scale_cluster(
            apps_v1,
            core_v1,
            namespace,
            name,
            old,
            master_change,
            data_diff,
            logger,
        )

    old_data_replicas = [
        WebhookScaleNodePayload(name=node["name"], replicas=node["replicas"])
        for node in old["spec"]["nodes"]["data"]
    ]
    new_data_replicas = [
        WebhookScaleNodePayload(name=node["name"], replicas=node["replicas"])
        for node in spec["nodes"]["data"]
    ]
    old_master_replicas = old["spec"]["nodes"].get("master", {}).get("replicas")
    new_master_replicas = spec["nodes"].get("master", {}).get("replicas")

    self.schedule_notification(
        WebhookEvent.SCALE,
        WebhookScalePayload(
            old_data_replicas=old_data_replicas,
            new_data_replicas=new_data_replicas,
            old_master_replicas=old_master_replicas,
            new_master_replicas=new_master_replicas,
        ),
        WebhookStatus.SUCCESS,
    )
    await self.send_notifications(logger)
async def create_deployment(
    self,
    container: str,
    num_replicas: int,
    cpus: float = 1.0,
    memory: float = 1.0,
) -> Tuple[str, str]:
    """
    Create a deployment plus a ``LoadBalancer`` service in ``KUBE_NAMESPACE``
    and wait until the service has an external IP.

    :param container: The image to run.
    :param num_replicas: The number of replicas of the deployment.
    :param cpus: The CPU request of the container.
    :param memory: The memory request; it is multiplied by 1024, truncated
        to an integer and suffixed with ``M``.
    :return: A tuple of the generated deployment id and the service URL
        (``http://<ip>:<port>``).
    """
    assert self.auth_client
    assert self.cluster_endpoint

    token = await self.auth_client.get()
    kube_config = client.Configuration(
        host=f"https://{self.cluster_endpoint}:443",
        api_key={"authorization": f"Bearer {token}"},
    )
    kube_config.verify_ssl = False

    async with ApiClient(configuration=kube_config) as kube:
        apps_v1 = client.AppsV1Api(kube)
        core_v1 = client.CoreV1Api(kube)

        # Create deployment
        deployment_id = f"dep-{uuid.uuid4()}"
        pod_labels = {"dep": deployment_id}
        container_spec = client.V1Container(
            name=deployment_id,
            env=[client.V1EnvVar(name="PORT", value=str(INTERNAL_PORT))],
            image=container,
            resources=client.V1ResourceRequirements(
                requests={
                    "cpu": str(cpus),
                    "memory": f"{int(memory * 1024)}M",
                }
            ),
            ports=[client.V1ContainerPort(container_port=INTERNAL_PORT)],
        )
        pod_template = client.V1PodTemplateSpec(
            metadata=client.V1ObjectMeta(labels={"dep": deployment_id}),
            spec=client.V1PodSpec(containers=[container_spec]),
        )
        deployment = client.V1Deployment(
            api_version="apps/v1",
            kind="Deployment",
            metadata=client.V1ObjectMeta(name=deployment_id),
            spec=client.V1DeploymentSpec(
                replicas=num_replicas,
                selector={"matchLabels": pod_labels},
                template=pod_template,
            ),
        )
        await apps_v1.create_namespaced_deployment(
            namespace=KUBE_NAMESPACE, body=deployment
        )

        # Create service
        service_id = f"{deployment_id}-svc"
        service_port = self.get_unassigned_port()
        exposed_port = client.V1ServicePort(
            protocol="TCP",
            port=service_port,
            target_port=INTERNAL_PORT,
        )
        service = client.V1Service(
            api_version="v1",
            kind="Service",
            metadata=client.V1ObjectMeta(
                name=service_id,
                # annotations={"cloud.google.com/load-balancer-type": "Internal"},
            ),
            spec=client.V1ServiceSpec(
                selector={"dep": deployment_id},
                ports=[exposed_port],
                type="LoadBalancer",
            ),
        )
        await core_v1.create_namespaced_service(
            namespace=KUBE_NAMESPACE, body=service
        )

        # Poll for external URL: repeat until the first ingress entry of the
        # load balancer has an IP.
        while True:
            await asyncio.sleep(POLL_INTERVAL)
            current = await core_v1.read_namespaced_service(
                name=service_id, namespace=KUBE_NAMESPACE
            )
            ingress = current.status.load_balancer.ingress
            service_ip = ingress[0].ip if ingress else None
            if service_ip:
                break

        service_url = f"http://{service_ip}:{service_port}"
        print(f"Started deployment {deployment_id} at {service_url}")
        return deployment_id, service_url
async def k8s_asyncio_api_client(kube_config) -> ApiClient:
    """
    Yield an ``ApiClient`` that is closed once the generator is resumed
    or finalised.

    ``kube_config`` is not used inside the body.
    """
    async with ApiClient() as kube:
        yield kube
async def k8s_create_namespace(name: str):
    """Create a namespace called ``name``."""
    async with ApiClient() as kube:
        core_v1 = k8s.client.CoreV1Api(kube)
        namespace_body = V1Namespace(metadata={"name": name})
        await core_v1.create_namespace(body=namespace_body)
async def k8s_access_token():
    """
    Return the access token of the loaded kube config.

    The token is read from the ``authorization`` API key of the client
    configuration; a leading ``Bearer`` scheme and surrounding whitespace
    are removed.

    :raises KeyError: When the configuration has no ``authorization`` key.
    """
    await k8s_async_config.load_kube_config()
    async with ApiClient() as api:
        authorization = api.configuration.api_key['authorization']

    # ``str.strip('Bearer')`` would strip any of the characters B, e, a, r
    # from both ends and thereby corrupt tokens that end in one of them.
    # Remove the scheme as an actual prefix instead.
    scheme = 'Bearer'
    if authorization.startswith(scheme):
        authorization = authorization[len(scheme):]
    return authorization.strip()