コード例 #1
0
ファイル: kubernetes_scheduler.py プロジェクト: kalebinn/dbnd
    def __init__(
        self, kube_config, task_queue, result_queue, kube_client, worker_uuid, kube_dbnd
    ):
        """
        Initialise the parent scheduler, then replace its manager, watcher
        queue and watcher with dbnd-specific ones and set up pod bookkeeping.
        """
        from multiprocessing.managers import SyncManager

        super(DbndKubernetesScheduler, self).__init__(
            kube_config, task_queue, result_queue, kube_client, worker_uuid
        )
        self.kube_dbnd = kube_dbnd

        # PATCH: the communication manager shared with the watcher.
        # Scheduler <-> (via _manager) KubeWatcher.
        # The manager is started with `mgr_init` because we want it to wait
        # for stop rather than exit in place: if it died in place, nothing
        # from the KubeWatcher would reach us until shutdown and messages not
        # yet received would be lost.
        # TODO: why can't we use original SyncManager?
        self._manager = SyncManager()
        self._manager.start(mgr_init)

        # the watcher is created after the queue and the resource version
        self.watcher_queue = self._manager.Queue()
        self.current_resource_version = 0
        self.kube_watcher = self._make_kube_watcher_dbnd()

        # maps a pod to its airflow key (dag_id, task_id, execution_date)
        self.submitted_pods = {}  # type: Dict[str,SubmittedPodState]

        # used to send data to the databand tracker
        self.metrics_logger = KubernetesMetricsLogger()

        # state for the "disappeared pods" mechanism
        self.last_disappeared_pods = {}
        self.current_iteration = 1

        # every log message gets the `k8s-scheduler:` prefix
        self._log = PrefixLoggerAdapter("k8s-scheduler", self.log)
コード例 #2
0
 def __init__(self):
     """Initialise zombie-query settings and a prefixed logger."""
     super(ClearZombieTaskInstancesForDagRun, self).__init__()
     self._last_zombie_query_time = None
     self.zombie_query_interval_secs = 60
     # the threshold is read from the [scheduler] section of `conf`
     threshold = conf.getint("scheduler", "scheduler_zombie_task_threshold")
     self.zombie_threshold_secs = threshold
     self._log = PrefixLoggerAdapter("clear-zombies", self.log)
コード例 #3
0
    def run(self):
        """
        Watch loop of the Kubernetes watcher.

        Runs in a separate process forked from the main one, so any client
        inherited from the main process may need a reset before it is used.
        The loop keeps calling `_run` and feeding its result back in as the
        next resource version; it ends on DatabandSigTermError or
        KeyboardInterrupt, and re-raises any other exception after logging it.
        """
        self._log = PrefixLoggerAdapter("k8s-watcher", self.log)
        from targets.fs import reset_fs_cache

        # This is not the Scheduler process:
        # the filesystem cache must be reset so out-of-cluster credentials
        # are not reused inside Kubernetes.
        reset_fs_cache()

        # DBND-AIRFLOW: this may run as part of a dbnd task, with the process
        # spawned from the task's context. Install our own handlers so the
        # driver and the watcher do not share signal handlers.
        signal.signal(signal.SIGINT, watcher_sig_handler)
        signal.signal(signal.SIGTERM, watcher_sig_handler)
        signal.signal(signal.SIGQUIT, watcher_sig_handler)

        self.log.info(
            "Event: and now my watch begins starting at resource_version: %s. Watcher PID: %s",
            self.resource_version,
            os.getpid(),
        )

        # a freshly created client, not the one inherited from the parent
        kube_client = self.kube_dbnd.engine_config.get_kube_client()
        try:
            while True:
                try:
                    job_uid = (
                        self.scheduler_job_id
                        if AIRFLOW_VERSION_2
                        else self.worker_uuid
                    )
                    self.resource_version = self._run(
                        kube_client,
                        self.resource_version,
                        job_uid,
                        self.kube_config,
                    )
                except DatabandSigTermError:
                    break
                except Exception:
                    self.log.exception(
                        "Unknown error in KubernetesJobWatcher. Failing")
                    raise

                # Only reached when `_run` returned normally: both handlers
                # above leave the loop body (break / raise).
                restart_delay = (
                    self.kube_dbnd.engine_config.watcher_recreation_interval_seconds
                )
                self.log.info(
                    "KubernetesWatcher restarting with resource_version: %s in %s seconds",
                    self.resource_version,
                    restart_delay,
                )
                time.sleep(restart_delay)
        except (KeyboardInterrupt, DatabandSigTermError):
            pass
コード例 #4
0
    def __init__(
        self,
        dag,
        execution_date,
        mark_success=False,
        donot_pickle=False,
        ignore_first_depends_on_past=False,
        ignore_task_deps=False,
        fail_fast=True,
        pool=None,
        delay_on_limit_secs=1.0,
        verbose=False,
        airflow_config=None,
        *args,
        **kwargs
    ):
        """
        Store the run options, initialise the parent job and optionally set
        up the Kubernetes runtime zombie cleaner.

        :param dag: DAG to run; its `dag_id` is stored as well.
        :param execution_date: execution date of the run.
        :param airflow_config: configuration object read for
            `clean_zombie_task_instances`. When it is None (the default) the
            zombie cleaner is left disabled.
        :param args: forwarded to the parent constructor.
        :param kwargs: forwarded to the parent constructor.

        The remaining keyword parameters are stored as same-named attributes.
        """
        self.dag = dag
        self.dag_id = dag.dag_id
        self.execution_date = execution_date
        self.mark_success = mark_success
        self.donot_pickle = donot_pickle
        self.ignore_first_depends_on_past = ignore_first_depends_on_past
        self.ignore_task_deps = ignore_task_deps
        self.fail_fast = fail_fast
        self.pool = pool
        self.delay_on_limit_secs = delay_on_limit_secs
        self.verbose = verbose

        self.terminating = False

        super(SingleDagRunJob, self).__init__(*args, **kwargs)

        self._logged_count = 0  # counter for status update
        self._logged_status = ""  # last printed status

        self.ti_state_manager = AirflowTaskInstanceStateManager()
        self.airflow_config = airflow_config  # type: AirflowConfig

        # `airflow_config` defaults to None; guard the attribute access so
        # the default does not fail with AttributeError.
        zombie_cleaner_enabled = (
            self.airflow_config is not None
            and self.airflow_config.clean_zombie_task_instances
            and "KubernetesExecutor" in self.executor_class
        )
        if zombie_cleaner_enabled:
            self._runtime_k8s_zombie_cleaner = ClearKubernetesRuntimeZombiesForDagRun(
                k8s_executor=self.executor
            )
            logger.info(
                "Zombie cleaner is enabled. "
                "It runs every %s seconds, threshold is %s seconds",
                self._runtime_k8s_zombie_cleaner.zombie_query_interval_secs,
                self._runtime_k8s_zombie_cleaner.zombie_threshold_secs,
            )
        else:
            self._runtime_k8s_zombie_cleaner = None
        self._log = PrefixLoggerAdapter("scheduler", self.log)
コード例 #5
0
    def __init__(self, kube_dbnd=None):
        # type: (DbndKubernetesExecutor, DbndKubernetesClient) -> None
        """
        Initialise the parent executor and sync its kube config with the
        engine config carried by `kube_dbnd`.
        """
        import os

        # Airflow's kubernetes configuration validation requires this
        # variable, so it is set before the parent constructor runs.
        os.environ["AIRFLOW__KUBERNETES__DAGS_IN_IMAGE"] = "True"
        super(DbndKubernetesExecutor, self).__init__()

        self.kube_dbnd = kube_dbnd
        # NOTE(review): `kube_dbnd` defaults to None but is dereferenced
        # here, so the default cannot actually be used - confirm with callers.
        _update_airflow_kube_config(
            airflow_kube_config=self.kube_config,
            engine_config=kube_dbnd.engine_config,
        )

        self._log = PrefixLoggerAdapter("k8s-executor", self.log)
コード例 #6
0
    def __init__(self, k8s_executor):
        """
        Bind the cleaner to a Kubernetes executor and copy its timing
        settings from the executor's engine config.
        """
        super(ClearKubernetesRuntimeZombiesForDagRun, self).__init__()

        self._last_zombie_query_time = None
        self.k8s_executor = k8s_executor  # type: DbndKubernetesExecutor

        # all time configurations come from the same engine config object
        engine_config = k8s_executor.kube_dbnd.engine_config
        self.zombie_threshold_secs = engine_config.zombie_threshold_secs
        self.zombie_query_interval_secs = engine_config.zombie_query_interval_secs
        self._pending_zombies_timeout = engine_config.pending_zombies_timeout

        self._log = PrefixLoggerAdapter("clear-zombies", self.log)
コード例 #7
0
ファイル: kube_dbnd_client.py プロジェクト: databand-ai/dbnd
 def __init__(self, pod_name, pod_namespace, kube_config, kube_client):
     """Remember the pod identity, the kube config/client and build a logger."""
     self.name = pod_name
     self.namespace = pod_namespace
     self.kube_client = kube_client
     self.kube_config = kube_config  # type: KubernetesEngineConfig
     # logger adapter created with a "pod <name>" prefix
     log_prefix = "pod %s" % self.name
     self.log = PrefixLoggerAdapter(log_prefix, logger)
コード例 #8
0
ファイル: kube_dbnd_client.py プロジェクト: databand-ai/dbnd
class DbndPodCtrl(object):
    """
    Controller for a single Kubernetes pod identified by name and namespace.

    Uses the given kube client to submit the pod, wait for it, stream its
    logs, inspect its status for deploy/runtime errors and delete it.
    """

    def __init__(self, pod_name, pod_namespace, kube_config, kube_client):
        """Store the pod identity, the kube config/client and build a logger."""
        self.kube_config = kube_config  # type: KubernetesEngineConfig
        self.name = pod_name
        self.namespace = pod_namespace
        self.kube_client = kube_client
        # logger adapter created with a "pod <name>" prefix
        self.log = PrefixLoggerAdapter("pod %s" % self.name, logger)

    def delete_pod(self):
        """
        Delete the pod unless the retention settings say to keep it.

        Nothing is deleted when `keep_finished_pods` is set, or when
        `keep_failed_pods` is set and the pod phase is neither RUNNING nor
        SUCCEEDED. API errors raised by the delete call are logged, not raised.
        """
        if self.kube_config.keep_finished_pods:
            self.log.warning(
                "Will not delete pod due to keep_finished_pods=True.")
            return

        if self.kube_config.keep_failed_pods:
            pod_phase = self.get_pod_phase()
            if pod_phase not in {PodPhase.RUNNING, PodPhase.SUCCEEDED}:
                self.log.warning(
                    "Keeping failed pod due to keep_failed_pods=True and state is %s",
                    pod_phase,
                )
                return

        try:
            self.kube_client.delete_namespaced_pod(
                self.name, self.namespace, body=client.V1DeleteOptions())
            self.log.info("Pod has been deleted.")
        except ApiException as e:
            # Best effort: API errors (including 404, i.e. the pod is already
            # gone) are only logged and never re-raised.
            self.log.info("Failed to delete pod: %s",
                          e if e.status != 404 else "pod not found")

    def get_pod_status_v1(self):
        # type: () -> Optional[V1Pod]
        """
        Read the pod from the API.

        :return: the pod object, or None when the API answers 404.
        :raises ApiException: for any API error other than 404.
        """
        try:
            return self.kube_client.read_namespaced_pod(
                name=self.name, namespace=self.namespace)
        except ApiException as e:
            # If the pod can not be found
            if e.status == 404:
                return None
            raise

    def get_pod_phase(self):
        """Return the pod's `status.phase`, or None when the pod is not found."""
        pod_resp = self.get_pod_status_v1()
        if not pod_resp:
            return None

        return pod_resp.status.phase

    def _wait_for_pod_started(self, _logger=None):
        """
        Poll the pod once per second until it leaves the pending phase.

        Deploy errors are validated on every poll (see `check_deploy_errors`),
        so a pod that fails to start raises instead of waiting forever.

        :param _logger: logger for the progress message; defaults to `self.log`.
        :raises DatabandError: when the pod can not be found, or when it is
            still pending after `kube_config.startup_timeout`.
        """
        _logger = _logger or self.log
        start_time = datetime.now()
        while True:
            pod_status = self.get_pod_status_v1()
            if not pod_status:
                # the placeholder used to be left unformatted; include the name
                raise DatabandError("Can not find pod at k8s:%s" % self.name)
            # PATCH:  validate deploy errors
            self.check_deploy_errors(pod_status)

            pod_phase = pod_status.status.phase
            if pod_phase.lower() != PodStatus.PENDING:
                return

            startup_delta = datetime.now() - start_time
            if startup_delta >= self.kube_config.startup_timeout:
                raise DatabandError("Pod is still not running after %s" %
                                    startup_delta)
            time.sleep(1)
            _logger.debug("Pod not yet started: %s", pod_status.status)

    def stream_pod_logs(self,
                        print_func=logger.info,
                        follow=False,
                        tail_lines=None):
        """
        Read the log of the pod's "base" container and feed it to `print_func`.

        :param print_func: callable invoked with every decoded log line.
        :param follow: forwarded to the kube client as `follow`.
        :param tail_lines: when truthy, forwarded as `tail_lines`.

        Errors raised while iterating or printing the log are logged and
        swallowed; errors raised by the initial API call propagate.
        """
        kwargs = {
            "name": self.name,
            "namespace": self.namespace,
            "container": "base",
            "follow": follow,
            "_preload_content": False,
        }
        if tail_lines:
            kwargs["tail_lines"] = tail_lines

        logs = self.kube_client.read_namespaced_pod_log(**kwargs)
        try:
            if self.kube_config.prefix_remote_log:
                # we want to remove regular header in log, and make it looks like '[pod_name] LOG FROM POD'
                prefix = "[%s]" % self.name
                with override_log_formatting(prefix + "%(message)s"):
                    for line in logs:
                        # the last byte of every line is dropped
                        # (presumably the trailing newline)
                        print_func(line[:-1].decode("utf-8"))
            else:
                for line in logs:
                    print_func(line[:-1].decode("utf-8"))
        except Exception as ex:
            # one placeholder per argument; the old message had a single
            # "%s" for two arguments, which breaks message formatting
            self.log.error("Failed to stream logs for %s:  %s", self.name, ex)

    def check_deploy_errors(self, pod_v1_resp):
        """
        Raise a friendly error when the pod status shows it can not be deployed.

        Two checks run, each behind its own `kube_config` flag:
        an "Unschedulable" condition (insufficient cpu/memory is only warned
        about, after an optional cluster capacity check; any other message
        raises), and an image pull / container config error on the first
        container while the pod phase is "Pending".
        """
        pod_status = pod_v1_resp.status
        if self.kube_config.check_unschedulable_condition and pod_status.conditions:
            for condition in pod_status.conditions:
                if condition.reason != "Unschedulable":
                    continue
                logger.info("pod is pending because %s", condition.message)
                if ("Insufficient cpu" in condition.message
                        or "Insufficient memory" in condition.message):
                    if self.kube_config.check_cluster_resource_capacity:
                        kube_resources_checker = DbndKubeResourcesChecker(
                            kube_client=self.kube_client,
                            kube_config=self.kube_config)
                        kube_resources_checker.check_if_resource_request_above_max_capacity(
                            condition.message)

                    self.log.warning("pod is pending because %s",
                                     condition.message)
                else:
                    raise friendly_error.executor_k8s.kubernetes_pod_unschedulable(
                        condition.message)

        if pod_status.container_statuses:
            # only the first container status is inspected
            container_waiting_state = pod_status.container_statuses[
                0].state.waiting
            if (self.kube_config.check_image_pull_errors
                    and pod_status.phase == "Pending"
                    and container_waiting_state):
                if container_waiting_state.reason == "ErrImagePull":
                    raise friendly_error.executor_k8s.kubernetes_image_not_found(
                        pod_status.container_statuses[0].image,
                        container_waiting_state.message,
                        long_msg=container_waiting_state.reason,
                    )

                if container_waiting_state.reason == "CreateContainerConfigError":
                    raise friendly_error.executor_k8s.kubernetes_pod_config_error(
                        container_waiting_state.message)

    def check_running_errors(self, pod_v1_resp):
        """
        Raise an error if pod in running state with Failed conditions

        :return: True when a "Ready" condition with a passing status is
            found, False when the pod has no "Ready" condition, and None when
            `kube_config.check_running_pod_errors` is off.
        """
        pod_status = pod_v1_resp.status
        if not self.kube_config.check_running_pod_errors:
            return
        if pod_status.conditions:
            for condition in pod_status.conditions:
                if condition.type != "Ready":
                    continue
                # We are looking for
                #  {
                #   u"status": u"False",
                #   u"lastProbeTime": None,
                #   u"type": u"Ready",
                #   u"lastTransitionTime": u"2021-01-22T04:54:13Z",
                #  },
                if not condition.status or condition.status == "False":
                    raise friendly_error.executor_k8s.kubernetes_running_pod_fails_on_condition(
                        condition, pod_name=pod_v1_resp.metadata.name)
                return True
        return False

    def wait(self):
        """
        Waits for pod completion

        Waits for the pod to start, streams its logs, then polls the phase
        every 5 seconds until it is SUCCEEDED or FAILED.

        :return: self
        :raises DatabandRuntimeError: when the pod is not in a final phase
            after `kube_config.submit_termination_grace_period`, or when the
            final phase is not SUCCEEDED.
        """
        self._wait_for_pod_started()
        self.log.info("Pod is running, reading logs..")
        self.stream_pod_logs(follow=True)
        self.log.info("Successfully read pod logs")

        pod_phase = self.get_pod_phase()
        wait_start = utcnow()
        while pod_phase not in {PodPhase.SUCCEEDED, PodPhase.FAILED}:
            logger.debug(
                "Pod '%s' is not completed with state %s, waiting..",
                self.name,
                pod_phase,
            )
            if (utcnow() - wait_start
                ) > self.kube_config.submit_termination_grace_period:
                raise DatabandRuntimeError(
                    "Pod is not in a final state after {grace_period}: {state}"
                    .format(
                        grace_period=self.kube_config.
                        submit_termination_grace_period,
                        state=pod_phase,
                    ))
            time.sleep(5)
            pod_phase = self.get_pod_phase()

        if pod_phase != PodPhase.SUCCEEDED:
            raise DatabandRuntimeError(
                "Pod returned a failure: {pod_phase}".format(
                    pod_phase=pod_phase))
        return self

    def run_pod(self,
                task_run: "TaskRun",
                pod: "k8s.V1Pod",
                detach_run: bool = False) -> "DbndPodCtrl":
        """
        Submit `pod` for `task_run` and, unless detached, wait for completion.

        :param task_run: task run whose state and external links are updated.
        :param pod: pod definition to submit.
        :param detach_run: return right after submission; also enabled by
            `kube_config.detach_run`, and overridden to False when
            `is_possible_to_detach_run` says detaching is not possible.
        :return: self
        :raises ApiException: re-raised when pod creation fails, after the
            task run has been marked FAILED.
        """
        kc = self.kube_config
        detach_run = detach_run or kc.detach_run
        if not self.is_possible_to_detach_run():
            detach_run = False

        req = kc.build_kube_pod_req(pod)
        self._attach_live_logs_container(req)

        readable_req_str = readable_pod_request(req)

        if kc.debug:
            # in debug mode the request is logged and saved next to the task run
            logger.info("Pod Creation Request: \n%s", readable_req_str)
            pod_file = task_run.task_run_attempt_file("pod.yaml")
            pod_file.write(readable_req_str)
            logger.debug("Pod Request has been saved to %s", pod_file)

        external_link_dict = self.build_external_links(pod)
        if external_link_dict:
            task_run.set_external_resource_urls(external_link_dict)

        task_run.set_task_run_state(TaskRunState.QUEUED)

        try:
            resp = self.kube_client.create_namespaced_pod(
                body=req, namespace=pod.metadata.namespace)
            logger.info("%s has been submitted at pod '%s' at namespace '%s'",
                        task_run, pod.metadata.name, pod.metadata.namespace)
            self.log.debug("Pod Creation Response: %s", resp)
        except ApiException as ex:
            task_run_error = TaskRunError.build_from_ex(ex, task_run)
            task_run.set_task_run_state(TaskRunState.FAILED,
                                        error=task_run_error)
            logger.error(
                "Exception when attempting to create Namespaced Pod using: %s",
                readable_req_str,
            )
            raise

        if detach_run:
            return self

        self.wait()
        return self

    def _attach_live_logs_container(self, req: typing.Dict[str, typing.Any]):
        """Let DbndPodRequestFactory attach its logs container to `req`."""
        from dbnd_docker.kubernetes.vendorized_airflow.request_factory import (
            DbndPodRequestFactory, )

        DbndPodRequestFactory(self.kube_config).attach_logs_container(req)

    def build_external_links(self, pod: "k8s.V1Pod"):
        """
        Collect the external links the kube config provides for `pod`.

        :return: dict with "k8s_dashboard" and/or "pod_log" keys; a key is
            present only when the corresponding link is truthy.
        """
        kc = self.kube_config
        dashboard_url = kc.get_dashboard_link(pod.metadata.namespace,
                                              pod.metadata.name)
        pod_log = kc.get_pod_log_link(pod.metadata.namespace,
                                      pod.metadata.name)
        external_link_dict = dict()
        if dashboard_url:
            external_link_dict["k8s_dashboard"] = dashboard_url
        if pod_log:
            external_link_dict["pod_log"] = pod_log
        return external_link_dict

    def is_possible_to_detach_run(self):
        """
        Tell whether the pod may run detached.

        :return: False when `kube_config.show_pod_log` or `kube_config.debug`
            is set (each case is logged), True otherwise.
        """
        kc = self.kube_config
        can_detach_run = True
        if kc.show_pod_log:
            logger.info(
                "%s is True, %s will send every docker in blocking mode",
                "show_pod_logs",
                kc.task_name,
            )
            can_detach_run = False
        if kc.debug:
            logger.info(
                "%s is True, %s will send every docker in blocking mode",
                "debug",
                kc.task_name,
            )
            can_detach_run = False
        return can_detach_run

    def get_pod_logs(self, tail_lines=100):
        """
        Fetch the pod log without following it.

        :param tail_lines: forwarded to `stream_pod_logs`.
        :return: list of decoded log lines, or None when fetching raised
            (the failure is logged instead of propagated).
        """
        try:
            logs = []
            self.stream_pod_logs(print_func=logs.append,
                                 tail_lines=tail_lines,
                                 follow=False)
            return logs
        except ApiException as ex:
            if ex.status == 404:
                self.log.info("failed to get log for pod: pod not found")
            else:
                self.log.exception("failed to get log: %s", ex)
        except Exception as ex:
            # the message has two placeholders, so pass the pod name as well
            self.log.error("failed to get log for %s: %s", self.name, ex)