def __init__(
    self, kube_config, task_queue, result_queue, kube_client, worker_uuid, kube_dbnd
):
    super(DbndKubernetesScheduler, self).__init__(
        kube_config, task_queue, result_queue, kube_client, worker_uuid
    )
    self.kube_dbnd = kube_dbnd

    # PATCH watcher communication manager
    # we want to wait for a clean stop instead of exiting in place,
    # so we can drain all not-yet-received messages
    from multiprocessing.managers import SyncManager

    # TODO: why can't we use the original SyncManager?
    # Scheduler <-> (via _manager) KubeWatcher
    # if _manager dies in place, we will not get any "info" from KubeWatcher until shutdown
    self._manager = SyncManager()
    self._manager.start(mgr_init)

    self.watcher_queue = self._manager.Queue()
    self.current_resource_version = 0
    self.kube_watcher = self._make_kube_watcher_dbnd()

    # pod to airflow key (dag_id, task_id, execution_date)
    self.submitted_pods = {}  # type: Dict[str, SubmittedPodState]

    # sending data to databand tracker
    self.metrics_logger = KubernetesMetricsLogger()

    # disappeared pods mechanism
    self.last_disappeared_pods = {}
    self.current_iteration = 1

    # add `k8s-scheduler:` prefix to all log messages
    self._log = PrefixLoggerAdapter("k8s-scheduler", self.log)
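# `mgr_init` is passed to SyncManager.start() above but is not shown in this section.
# A minimal sketch of such an initializer, assuming its only job is to keep the manager
# process alive through Ctrl-C so queued watcher messages can still be drained; the real
# implementation may do more.
import signal

def mgr_init():
    # hypothetical sketch: ignore SIGINT inside the SyncManager process so that a
    # KeyboardInterrupt in the scheduler does not kill the shared queue before shutdown
    signal.signal(signal.SIGINT, signal.SIG_IGN)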
def __init__(self):
    super(ClearZombieTaskInstancesForDagRun, self).__init__()
    self.zombie_threshold_secs = conf.getint(
        "scheduler", "scheduler_zombie_task_threshold"
    )
    self.zombie_query_interval_secs = 60
    self._last_zombie_query_time = None
    self._log = PrefixLoggerAdapter("clear-zombies", self.log)
def run(self):
    """
    Performs watching.

    This code runs in a separate process, forked from the main one.
    Whatever clients we had in the main process might require a reset before we use them.
    """
    self._log = PrefixLoggerAdapter("k8s-watcher", self.log)

    from targets.fs import reset_fs_cache

    # we are in a different process than the Scheduler
    # 1. Must reset the filesystem cache to avoid using out-of-cluster credentials within Kubernetes
    reset_fs_cache()

    # DBND-AIRFLOW: this code might run as part of a dbnd task and
    # this process is spawned from the context of the task.
    # Must reset signal handlers to avoid driver and watcher sharing signal handlers
    signal.signal(signal.SIGINT, watcher_sig_handler)
    signal.signal(signal.SIGTERM, watcher_sig_handler)
    signal.signal(signal.SIGQUIT, watcher_sig_handler)

    self.log.info(
        "Event: and now my watch begins starting at resource_version: %s. Watcher PID: %s",
        self.resource_version,
        os.getpid(),
    )

    # we want a new, refreshed client!
    kube_client = self.kube_dbnd.engine_config.get_kube_client()
    try:
        while True:
            try:
                if AIRFLOW_VERSION_2:
                    job_uid = self.scheduler_job_id
                else:
                    job_uid = self.worker_uuid

                self.resource_version = self._run(
                    kube_client, self.resource_version, job_uid, self.kube_config
                )
            except DatabandSigTermError:
                break
            except Exception:
                self.log.exception("Unknown error in KubernetesJobWatcher. Failing")
                raise
            else:
                self.log.info(
                    "KubernetesWatcher restarting with resource_version: %s in %s seconds",
                    self.resource_version,
                    self.kube_dbnd.engine_config.watcher_recreation_interval_seconds,
                )
                time.sleep(
                    self.kube_dbnd.engine_config.watcher_recreation_interval_seconds
                )
    except (KeyboardInterrupt, DatabandSigTermError):
        pass
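# `watcher_sig_handler` is installed above but not defined in this section. A plausible
# sketch, assuming it simply converts termination signals into the DatabandSigTermError
# that the watch loop already catches; the real handler may log or carry extra context.
def watcher_sig_handler(signum, frame):
    # hypothetical sketch: translate SIGINT/SIGTERM/SIGQUIT into the exception the
    # run() loop handles, so the watcher exits its loop instead of dying mid-iteration
    raise DatabandSigTermError("Watcher received signal %s" % signum)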
def __init__(
    self,
    dag,
    execution_date,
    mark_success=False,
    donot_pickle=False,
    ignore_first_depends_on_past=False,
    ignore_task_deps=False,
    fail_fast=True,
    pool=None,
    delay_on_limit_secs=1.0,
    verbose=False,
    airflow_config=None,
    *args,
    **kwargs
):
    self.dag = dag
    self.dag_id = dag.dag_id
    self.execution_date = execution_date
    self.mark_success = mark_success
    self.donot_pickle = donot_pickle
    self.ignore_first_depends_on_past = ignore_first_depends_on_past
    self.ignore_task_deps = ignore_task_deps
    self.fail_fast = fail_fast
    self.pool = pool
    self.delay_on_limit_secs = delay_on_limit_secs
    self.verbose = verbose
    self.terminating = False

    super(SingleDagRunJob, self).__init__(*args, **kwargs)

    self._logged_count = 0  # counter for status update
    self._logged_status = ""  # last printed status

    self.ti_state_manager = AirflowTaskInstanceStateManager()
    self.airflow_config = airflow_config  # type: AirflowConfig

    if (
        self.airflow_config.clean_zombie_task_instances
        and "KubernetesExecutor" in self.executor_class
    ):
        self._runtime_k8s_zombie_cleaner = ClearKubernetesRuntimeZombiesForDagRun(
            k8s_executor=self.executor
        )
        logger.info(
            "Zombie cleaner is enabled. "
            "It runs every %s seconds, threshold is %s seconds",
            self._runtime_k8s_zombie_cleaner.zombie_query_interval_secs,
            self._runtime_k8s_zombie_cleaner.zombie_threshold_secs,
        )
    else:
        self._runtime_k8s_zombie_cleaner = None

    self._log = PrefixLoggerAdapter("scheduler", self.log)
def __init__(self, kube_dbnd=None):
    # type: (DbndKubernetesExecutor, DbndKubernetesClient) -> None
    from os import environ

    # This env variable is required for airflow's kubernetes configuration validation
    environ["AIRFLOW__KUBERNETES__DAGS_IN_IMAGE"] = "True"
    super(DbndKubernetesExecutor, self).__init__()

    self.kube_dbnd = kube_dbnd
    _update_airflow_kube_config(
        airflow_kube_config=self.kube_config, engine_config=kube_dbnd.engine_config
    )

    self._log = PrefixLoggerAdapter("k8s-executor", self.log)
def __init__(self, k8s_executor):
    super(ClearKubernetesRuntimeZombiesForDagRun, self).__init__()
    self._last_zombie_query_time = None
    self.k8s_executor = k8s_executor  # type: DbndKubernetesExecutor

    # time configurations
    self.zombie_threshold_secs = (
        k8s_executor.kube_dbnd.engine_config.zombie_threshold_secs
    )
    self.zombie_query_interval_secs = (
        k8s_executor.kube_dbnd.engine_config.zombie_query_interval_secs
    )
    self._pending_zombies_timeout = (
        k8s_executor.kube_dbnd.engine_config.pending_zombies_timeout
    )

    self._log = PrefixLoggerAdapter("clear-zombies", self.log)
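# `PrefixLoggerAdapter` is used by every constructor above but is not shown in this
# section. A minimal sketch of what such an adapter could look like, assuming it only
# prepends the given prefix to each message (the real class may differ); it builds on
# the standard logging.LoggerAdapter API.
import logging

class PrefixLoggerAdapter(logging.LoggerAdapter):
    # hypothetical sketch: route every record through process() and prefix it
    def __init__(self, prefix, logger):
        super(PrefixLoggerAdapter, self).__init__(logger, extra={})
        self._prefix = prefix

    def process(self, msg, kwargs):
        return "%s: %s" % (self._prefix, msg), kwargs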
class DbndPodCtrl(object):
    def __init__(self, pod_name, pod_namespace, kube_config, kube_client):
        self.kube_config = kube_config  # type: KubernetesEngineConfig
        self.name = pod_name
        self.namespace = pod_namespace
        self.kube_client = kube_client
        self.log = PrefixLoggerAdapter("pod %s" % self.name, logger)

    def delete_pod(self):
        if self.kube_config.keep_finished_pods:
            self.log.warning("Will not delete pod due to keep_finished_pods=True.")
            return

        if self.kube_config.keep_failed_pods:
            pod_phase = self.get_pod_phase()
            if pod_phase not in {PodPhase.RUNNING, PodPhase.SUCCEEDED}:
                self.log.warning(
                    "Keeping failed pod due to keep_failed_pods=True and state is %s",
                    pod_phase,
                )
                return

        try:
            self.kube_client.delete_namespaced_pod(
                self.name, self.namespace, body=client.V1DeleteOptions()
            )
            self.log.info("Pod has been deleted.")
        except ApiException as e:
            self.log.info(
                "Failed to delete pod: %s",
                e if e.status != 404 else "pod not found",
            )
            # If the pod is already deleted, don't raise
            # if e.status != 404:
            #     raise

    def get_pod_status_v1(self):
        # type: () -> Optional[V1Pod]
        try:
            return self.kube_client.read_namespaced_pod(
                name=self.name, namespace=self.namespace
            )
        except ApiException as e:
            # If the pod can not be found
            if e.status == 404:
                return None
            raise

    def get_pod_phase(self):
        pod_resp = self.get_pod_status_v1()
        if not pod_resp:
            return None
        return pod_resp.status.phase

    def _wait_for_pod_started(self, _logger=None):
        """
        Will try to raise an exception if the pod fails to start
        (see DbndPodLauncher.check_deploy_errors).
        """
        _logger = _logger or self.log
        start_time = datetime.now()
        while True:
            pod_status = self.get_pod_status_v1()
            if not pod_status:
                raise DatabandError("Can not find pod at k8s: %s" % self.name)
            # PATCH: validate deploy errors
            self.check_deploy_errors(pod_status)

            pod_phase = pod_status.status.phase
            if pod_phase.lower() != PodStatus.PENDING:
                return

            startup_delta = datetime.now() - start_time
            if startup_delta >= self.kube_config.startup_timeout:
                raise DatabandError(
                    "Pod is still not running after %s" % startup_delta
                )
            time.sleep(1)
            _logger.debug("Pod not yet started: %s", pod_status.status)

    def stream_pod_logs(self, print_func=logger.info, follow=False, tail_lines=None):
        kwargs = {
            "name": self.name,
            "namespace": self.namespace,
            "container": "base",
            "follow": follow,
            "_preload_content": False,
        }
        if tail_lines:
            kwargs["tail_lines"] = tail_lines

        logs = self.kube_client.read_namespaced_pod_log(**kwargs)
        try:
            if self.kube_config.prefix_remote_log:
                # we want to remove the regular header from the log and make it look like '[pod_name] LOG FROM POD'
                prefix = "[%s]" % self.name
                with override_log_formatting(prefix + "%(message)s"):
                    for line in logs:
                        print_func(line[:-1].decode("utf-8"))
            else:
                for line in logs:
                    print_func(line[:-1].decode("utf-8"))
        except Exception as ex:
            self.log.error("Failed to stream logs for %s: %s", self.name, ex)

    def check_deploy_errors(self, pod_v1_resp):
        pod_status = pod_v1_resp.status
        if self.kube_config.check_unschedulable_condition and pod_status.conditions:
            for condition in pod_status.conditions:
                if condition.reason != "Unschedulable":
                    continue
                logger.info("pod is pending because %s" % condition.message)
                if (
                    "Insufficient cpu" in condition.message
                    or "Insufficient memory" in condition.message
                ):
                    if self.kube_config.check_cluster_resource_capacity:
                        kube_resources_checker = DbndKubeResourcesChecker(
                            kube_client=self.kube_client,
                            kube_config=self.kube_config,
                        )
                        kube_resources_checker.check_if_resource_request_above_max_capacity(
                            condition.message
                        )

                    self.log.warning("pod is pending because %s" % condition.message)
                else:
                    raise friendly_error.executor_k8s.kubernetes_pod_unschedulable(
                        condition.message
                    )

        if pod_status.container_statuses:
            container_waiting_state = pod_status.container_statuses[0].state.waiting
            if (
                self.kube_config.check_image_pull_errors
                and pod_status.phase == "Pending"
                and container_waiting_state
            ):
                if container_waiting_state.reason == "ErrImagePull":
                    raise friendly_error.executor_k8s.kubernetes_image_not_found(
                        pod_status.container_statuses[0].image,
                        container_waiting_state.message,
                        long_msg=container_waiting_state.reason,
                    )

                if container_waiting_state.reason == "CreateContainerConfigError":
                    raise friendly_error.executor_k8s.kubernetes_pod_config_error(
                        container_waiting_state.message
                    )

    def check_running_errors(self, pod_v1_resp):
        """Raise an error if the pod is in a running state with failed conditions."""
        pod_status = pod_v1_resp.status
        if not self.kube_config.check_running_pod_errors:
            return

        if pod_status.conditions:
            for condition in pod_status.conditions:
                if condition.type != "Ready":
                    continue
                # We are looking for
                # {
                #     u"status": u"False",
                #     u"lastProbeTime": None,
                #     u"type": u"Ready",
                #     u"lastTransitionTime": u"2021-01-22T04:54:13Z",
                # },
                if not condition.status or condition.status == "False":
                    raise friendly_error.executor_k8s.kubernetes_running_pod_fails_on_condition(
                        condition, pod_name=pod_v1_resp.metadata.name
                    )
                return True
        return False

    def wait(self):
        """
        Waits for pod completion.
        """
        self._wait_for_pod_started()
        self.log.info("Pod is running, reading logs..")
        self.stream_pod_logs(follow=True)
        self.log.info("Successfully read pod logs")

        pod_phase = self.get_pod_phase()
        wait_start = utcnow()
        while pod_phase not in {PodPhase.SUCCEEDED, PodPhase.FAILED}:
            logger.debug(
                "Pod '%s' is not completed with state %s, waiting..",
                self.name,
                pod_phase,
            )
            if (
                utcnow() - wait_start
            ) > self.kube_config.submit_termination_grace_period:
                raise DatabandRuntimeError(
                    "Pod is not in a final state after {grace_period}: {state}".format(
                        grace_period=self.kube_config.submit_termination_grace_period,
                        state=pod_phase,
                    )
                )
            time.sleep(5)
            pod_phase = self.get_pod_phase()

        if pod_phase != PodPhase.SUCCEEDED:
            raise DatabandRuntimeError(
                "Pod returned a failure: {pod_phase}".format(pod_phase=pod_phase)
            )
        return self

    def run_pod(
        self, task_run: "TaskRun", pod: "k8s.V1Pod", detach_run: bool = False
    ) -> "DbndPodCtrl":
        kc = self.kube_config
        detach_run = detach_run or kc.detach_run
        if not self.is_possible_to_detach_run():
            detach_run = False

        req = kc.build_kube_pod_req(pod)
        self._attach_live_logs_container(req)

        readable_req_str = readable_pod_request(req)

        if kc.debug:
            logger.info("Pod Creation Request: \n%s", readable_req_str)

        pod_file = task_run.task_run_attempt_file("pod.yaml")
        pod_file.write(readable_req_str)
        logger.debug("Pod Request has been saved to %s", pod_file)

        external_link_dict = self.build_external_links(pod)
        if external_link_dict:
            task_run.set_external_resource_urls(external_link_dict)
        task_run.set_task_run_state(TaskRunState.QUEUED)

        try:
            resp = self.kube_client.create_namespaced_pod(
                body=req, namespace=pod.metadata.namespace
            )
            logger.info(
                "%s has been submitted at pod '%s' at namespace '%s'"
                % (task_run, pod.metadata.name, pod.metadata.namespace)
            )
            self.log.debug("Pod Creation Response: %s", resp)
        except ApiException as ex:
            task_run_error = TaskRunError.build_from_ex(ex, task_run)
            task_run.set_task_run_state(TaskRunState.FAILED, error=task_run_error)
            logger.error(
                "Exception when attempting to create Namespaced Pod using: %s",
                readable_req_str,
            )
            raise

        if detach_run:
            return self

        self.wait()
        return self

    def _attach_live_logs_container(self, req: typing.Dict[str, typing.Any]):
        from dbnd_docker.kubernetes.vendorized_airflow.request_factory import (
            DbndPodRequestFactory,
        )

        DbndPodRequestFactory(self.kube_config).attach_logs_container(req)

    def build_external_links(self, pod: "k8s.V1Pod"):
        kc = self.kube_config
        dashboard_url = kc.get_dashboard_link(
            pod.metadata.namespace, pod.metadata.name
        )
        pod_log = kc.get_pod_log_link(pod.metadata.namespace, pod.metadata.name)

        external_link_dict = dict()
        if dashboard_url:
            external_link_dict["k8s_dashboard"] = dashboard_url
        if pod_log:
            external_link_dict["pod_log"] = pod_log
        return external_link_dict

    def is_possible_to_detach_run(self):
        kc = self.kube_config
        can_detach_run = True
        if kc.show_pod_log:
            logger.info(
                "%s is True, %s will send every docker in blocking mode",
                "show_pod_log",
                kc.task_name,
            )
            can_detach_run = False
        if kc.debug:
            logger.info(
                "%s is True, %s will send every docker in blocking mode",
                "debug",
                kc.task_name,
            )
            can_detach_run = False

        return can_detach_run

    def get_pod_logs(self, tail_lines=100):
        try:
            logs = []
            log_printer = lambda x: logs.append(x)
            self.stream_pod_logs(
                print_func=log_printer, tail_lines=tail_lines, follow=False
            )
            return logs
        except ApiException as ex:
            if ex.status == 404:
                self.log.info("failed to get log for pod: pod not found")
            else:
                self.log.exception("failed to get log: %s", ex)
        except Exception as ex:
            self.log.error("failed to get log for %s: %s", self.name, ex)
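# A hedged usage sketch of DbndPodCtrl, submitting a pod and blocking until it finishes.
# The `engine_config`, `task_run`, and `pod` objects here are stand-ins for whatever the
# surrounding dbnd engine code provides; only the DbndPodCtrl calls below are taken from
# the class above.
pod_ctrl = DbndPodCtrl(
    pod_name=pod.metadata.name,
    pod_namespace=pod.metadata.namespace,
    kube_config=engine_config,
    kube_client=engine_config.get_kube_client(),
)
pod_ctrl.run_pod(task_run=task_run, pod=pod, detach_run=False)  # submits, then wait()s
print(pod_ctrl.get_pod_phase())               # e.g. "Succeeded"
print(pod_ctrl.get_pod_logs(tail_lines=20))   # last 20 log lines as a list
pod_ctrl.delete_pod()                         # respects keep_finished_pods / keep_failed_pods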