Example #1
    def execute(self, context):
        try:
            client = kube_client.get_kube_client(
                in_cluster=self.in_cluster,
                cluster_context=self.cluster_context,
                config_file=self.config_file)
            gen = pod_generator.PodGenerator()

            for port in self.ports:
                gen.add_port(port)
            for mount in self.volume_mounts:
                gen.add_mount(mount)
            for volume in self.volumes:
                gen.add_volume(volume)

            pod = gen.make_pod(
                namespace=self.namespace,
                image=self.image,
                pod_id=self.name,
                cmds=self.cmds,
                arguments=self.arguments,
                labels=self.labels,
            )

            pod.service_account_name = self.service_account_name
            pod.secrets = self.secrets
            pod.envs = self.env_vars
            pod.image_pull_policy = self.image_pull_policy
            pod.image_pull_secrets = self.image_pull_secrets
            pod.annotations = self.annotations
            pod.resources = self.resources
            pod.affinity = self.affinity
            pod.node_selectors = self.node_selectors
            pod.hostnetwork = self.hostnetwork
            pod.tolerations = self.tolerations
            pod.configmaps = self.configmaps
            pod.security_context = self.security_context
            pod.pod_runtime_info_envs = self.pod_runtime_info_envs
            pod.dnspolicy = self.dnspolicy

            launcher = pod_launcher.PodLauncher(kube_client=client,
                                                extract_xcom=self.do_xcom_push)
            try:
                (final_state, result) = launcher.run_pod(
                    pod,
                    startup_timeout=self.startup_timeout_seconds,
                    logs_connection_timeout=self.logs_connection_timeout_seconds,
                    get_logs=self.get_logs)
            finally:
                if self.is_delete_operator_pod:
                    launcher.delete_pod(pod)

            if final_state != State.SUCCESS:
                raise AirflowException(
                    'Pod returned a failure: {state}'.format(
                        state=final_state))

            return result
        except AirflowException as ex:
            raise AirflowException(
                'Pod Launching failed: {error}'.format(error=ex))
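For reference, get_kube_client is the entry point all of these examples share: with in_cluster=True it loads the service-account credentials mounted into the pod, otherwise it falls back to a kubeconfig file. A minimal sketch of a direct call; the context name and kubeconfig path below are hypothetical:

from airflow.kubernetes import kube_client

# Inside a cluster (service-account credentials mounted into the pod):
client = kube_client.get_kube_client(in_cluster=True)

# Outside a cluster, against a hypothetical kubeconfig and context:
client = kube_client.get_kube_client(
    in_cluster=False,
    cluster_context='my-context',        # hypothetical context name
    config_file='/path/to/kubeconfig',   # hypothetical path
)

# The returned object is a kubernetes CoreV1Api client:
print(len(client.list_namespaced_pod('default').items))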
Example #2
    def tearDown(self) -> None:
        client = kube_client.get_kube_client(in_cluster=False)
        client.delete_collection_namespaced_pod(namespace="default")
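delete_collection_namespaced_pod as used above removes every pod in the namespace, which is acceptable for test teardown. Outside of tests you would usually scope the call, for example with a label selector; a sketch, reusing the client from the snippet and matching the airflow_version label the operators in these examples attach:

# Only delete pods that carry the Airflow-assigned label key:
client.delete_collection_namespaced_pod(
    namespace="default",
    label_selector="airflow_version",
)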
Example #3
    def execute(self, context):
        try:
            client = kube_client.get_kube_client(in_cluster=self.in_cluster,
                                                 cluster_context=self.cluster_context,
                                                 config_file=self.config_file)

            # Add Airflow Version to the label
            # And a label to identify that pod is launched by KubernetesPodOperator
            self.labels.update(
                {
                    'airflow_version': airflow_version.replace('+', '-'),
                    'kubernetes_pod_operator': 'True',
                }
            )

            pod = pod_generator.PodGenerator(
                image=self.image,
                namespace=self.namespace,
                cmds=self.cmds,
                args=self.arguments,
                labels=self.labels,
                name=self.name,
                envs=self.env_vars,
                extract_xcom=self.do_xcom_push,
                image_pull_policy=self.image_pull_policy,
                node_selectors=self.node_selectors,
                annotations=self.annotations,
                affinity=self.affinity,
                image_pull_secrets=self.image_pull_secrets,
                service_account_name=self.service_account_name,
                hostnetwork=self.hostnetwork,
                tolerations=self.tolerations,
                configmaps=self.configmaps,
                security_context=self.security_context,
                dnspolicy=self.dnspolicy,
                schedulername=self.schedulername,
                pod=self.full_pod_spec,
                init_containers=self.init_containers,
            ).gen_pod()

            pod = append_to_pod(
                pod,
                self.pod_runtime_info_envs +
                self.ports +
                self.resources +
                self.secrets +
                self.volumes +
                self.volume_mounts
            )

            self.pod = pod

            launcher = pod_launcher.PodLauncher(kube_client=client,
                                                extract_xcom=self.do_xcom_push)

            try:
                (final_state, result) = launcher.run_pod(
                    pod,
                    startup_timeout=self.startup_timeout_seconds,
                    get_logs=self.get_logs)
            finally:
                if self.is_delete_operator_pod:
                    launcher.delete_pod(pod)

            if final_state != State.SUCCESS:
                raise AirflowException(
                    'Pod returned a failure: {state}'.format(state=final_state)
                )

            return result
        except AirflowException as ex:
            raise AirflowException('Pod Launching failed: {error}'.format(error=ex))
Example #4
    def _read(self, ti, try_number, metadata=None):
        """
        Template method that contains custom logic of reading
        logs given the try_number.

        :param ti: task instance record
        :param try_number: current try_number to read log from
        :param metadata: log metadata,
                         can be used for streaming log reading and auto-tailing.
        :return: log message as a string and metadata.
        """
        # Task instance here might be different from task instance when
        # initializing the handler. Thus explicitly getting log location
        # is needed to get correct log path.
        log_relative_path = self._render_filename(ti, try_number)
        location = os.path.join(self.local_base, log_relative_path)

        log = ""

        if os.path.exists(location):
            try:
                with open(location) as file:
                    log += "*** Reading local file: {}\n".format(location)
                    log += "".join(file.readlines())
            except Exception as e:
                log = "*** Failed to load local log file: {}\n".format(
                    location)
                log += "*** {}\n".format(str(e))
        elif conf.get('core', 'executor') == 'KubernetesExecutor':
            log += '*** Trying to get logs (last 100 lines) from worker pod {} ***\n\n'\
                .format(ti.hostname)

            try:
                from airflow.kubernetes.kube_client import get_kube_client

                kube_client = get_kube_client()
                res = kube_client.read_namespaced_pod_log(
                    name=ti.hostname,
                    namespace=conf.get('kubernetes', 'namespace'),
                    container='base',
                    follow=False,
                    tail_lines=100,
                    _preload_content=False)

                for line in res:
                    log += line.decode()

            except Exception as f:  # pylint: disable=broad-except
                log += '*** Unable to fetch logs from worker pod {} ***\n{}\n\n'.format(
                    ti.hostname, str(f))
        else:
            url = os.path.join(
                "http://{ti.hostname}:{worker_log_server_port}/log",
                log_relative_path).format(ti=ti,
                                          worker_log_server_port=conf.get(
                                              'celery',
                                              'WORKER_LOG_SERVER_PORT'))
            log += "*** Log file does not exist: {}\n".format(location)
            log += "*** Fetching from: {}\n".format(url)
            try:
                timeout = None  # No timeout
                try:
                    timeout = conf.getint('webserver', 'log_fetch_timeout_sec')
                except (AirflowConfigException, ValueError):
                    pass

                response = requests.get(url, timeout=timeout)
                response.encoding = "utf-8"

                # Check if the resource was properly fetched
                response.raise_for_status()

                log += '\n' + response.text
            except Exception as e:
                log += "*** Failed to fetch log file from worker. {}\n".format(
                    str(e))

        return log, {'end_of_log': True}
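The _preload_content=False flag in read_namespaced_pod_log makes the Kubernetes client return the raw urllib3 response instead of a decoded string, which is why the loop above iterates and decodes line by line. A standalone sketch; the pod name and namespace are hypothetical:

from airflow.kubernetes.kube_client import get_kube_client

kube_client = get_kube_client()
# Stream the last 100 lines without buffering the whole body in memory.
res = kube_client.read_namespaced_pod_log(
    name='example-worker-pod',   # hypothetical pod name
    namespace='default',         # hypothetical namespace
    container='base',
    follow=False,
    tail_lines=100,
    _preload_content=False,
)
for line in res:
    print(line.decode())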
Example #5
    def _read(self, ti, try_number, metadata=None):
        """
        Template method that contains custom logic of reading
        logs given the try_number.

        :param ti: task instance record
        :param try_number: current try_number to read log from
        :param metadata: log metadata,
                         can be used for streaming log reading and auto-tailing.
        :return: log message as a string and metadata.
        """
        # Task instance here might be different from task instance when
        # initializing the handler. Thus explicitly getting log location
        # is needed to get correct log path.
        log_relative_path = self._render_filename(ti, try_number)
        location = os.path.join(self.local_base, log_relative_path)

        log = ""

        if os.path.exists(location):
            try:
                with open(location) as file:
                    log += f"*** Reading local file: {location}\n"
                    log += "".join(file.readlines())
            except Exception as e:
                log = f"*** Failed to load local log file: {location}\n"
                log += f"*** {str(e)}\n"
        elif conf.get('core', 'executor') == 'KubernetesExecutor':
            try:
                from airflow.kubernetes.kube_client import get_kube_client

                kube_client = get_kube_client()

                if len(ti.hostname) >= 63:
                    # Kubernetes takes the pod name and truncates it for the hostname. This truncated hostname
                    # is returned for the fqdn to comply with the 63 character limit imposed by DNS standards
                    # on any label of a FQDN.
                    pod_list = kube_client.list_namespaced_pod(
                        conf.get('kubernetes', 'namespace'))
                    matches = [
                        pod.metadata.name for pod in pod_list.items
                        if pod.metadata.name.startswith(ti.hostname)
                    ]
                    if len(matches) == 1:
                        if len(matches[0]) > len(ti.hostname):
                            ti.hostname = matches[0]

                log += '*** Trying to get logs (last 100 lines) from worker pod {} ***\n\n'.format(
                    ti.hostname)

                res = kube_client.read_namespaced_pod_log(
                    name=ti.hostname,
                    namespace=conf.get('kubernetes', 'namespace'),
                    container='base',
                    follow=False,
                    tail_lines=100,
                    _preload_content=False,
                )

                for line in res:
                    log += line.decode()

            except Exception as f:
                log += f'*** Unable to fetch logs from worker pod {ti.hostname} ***\n{str(f)}\n\n'
        else:
            url = os.path.join(
                "http://{ti.hostname}:{worker_log_server_port}/log",
                log_relative_path).format(ti=ti,
                                          worker_log_server_port=conf.get(
                                              'celery',
                                              'WORKER_LOG_SERVER_PORT'))
            log += f"*** Log file does not exist: {location}\n"
            log += f"*** Fetching from: {url}\n"
            try:
                timeout = None  # No timeout
                try:
                    timeout = conf.getint('webserver', 'log_fetch_timeout_sec')
                except (AirflowConfigException, ValueError):
                    pass

                signer = TimedJSONWebSignatureSerializer(
                    secret_key=conf.get('webserver', 'secret_key'),
                    algorithm_name='HS512',
                    expires_in=conf.getint('webserver',
                                           'log_request_clock_grace',
                                           fallback=30),
                    # This isn't really a "salt", more of a signing context
                    salt='task-instance-logs',
                )

                response = httpx.get(
                    url,
                    timeout=timeout,
                    headers={'Authorization': signer.dumps(log_relative_path)})
                response.encoding = "utf-8"

                # Check if the resource was properly fetched
                response.raise_for_status()

                log += '\n' + response.text
            except Exception as e:
                log += f"*** Failed to fetch log file from worker. {str(e)}\n"

        return log, {'end_of_log': True}
Example #6
def cleanup_pods(args):
    """Clean up k8s pods in evicted/failed/succeeded states"""
    namespace = args.namespace

    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/
    # All Containers in the Pod have terminated in success, and will not be restarted.
    pod_succeeded = 'succeeded'

    # All Containers in the Pod have terminated, and at least one Container has terminated in failure.
    # That is, the Container either exited with non-zero status or was terminated by the system.
    pod_failed = 'failed'

    # https://kubernetes.io/docs/tasks/administer-cluster/out-of-resource/
    pod_reason_evicted = 'evicted'
    # If pod is failed and restartPolicy is:
    # * Always: Restart Container; Pod phase stays Running.
    # * OnFailure: Restart Container; Pod phase stays Running.
    # * Never: Pod phase becomes Failed.
    pod_restart_policy_never = 'never'

    print('Loading Kubernetes configuration')
    kube_client = get_kube_client()
    print(f'Listing pods in namespace {namespace}')
    airflow_pod_labels = [
        'dag_id',
        'task_id',
        'execution_date',
        'try_number',
        'airflow_version',
    ]
    list_kwargs = {
        "namespace": namespace,
        "limit": 500,
        "label_selector": ','.join(airflow_pod_labels)
    }

    while True:
        pod_list = kube_client.list_namespaced_pod(**list_kwargs)
        for pod in pod_list.items:
            pod_name = pod.metadata.name
            print(f'Inspecting pod {pod_name}')
            pod_phase = pod.status.phase.lower()
            pod_reason = pod.status.reason.lower() if pod.status.reason else ''
            pod_restart_policy = pod.spec.restart_policy.lower()

            if (pod_phase == pod_succeeded
                    or (pod_phase == pod_failed
                        and pod_restart_policy == pod_restart_policy_never)
                    or (pod_reason == pod_reason_evicted)):
                print(
                    f'Deleting pod "{pod_name}" phase "{pod_phase}" and reason "{pod_reason}", '
                    f'restart policy "{pod_restart_policy}"')
                try:
                    _delete_pod(pod.metadata.name, namespace)
                except ApiException as e:
                    print(f"Can't remove POD: {e}", file=sys.stderr)
                continue
            print(f'No action taken on pod {pod_name}')
        continue_token = pod_list.metadata._continue
        if not continue_token:
            break
        list_kwargs["_continue"] = continue_token
Example #7
    def on_kill(self) -> None:
        """
        Kill Spark submit command
        """
        self.log.debug("Kill Command is being called")

        if self._should_track_driver_status:
            if self._driver_id:
                self.log.info('Killing driver %s on cluster', self._driver_id)

                kill_cmd = self._build_spark_driver_kill_command()
                driver_kill = subprocess.Popen(kill_cmd,
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE)

                self.log.info("Spark driver %s killed with return code: %s",
                              self._driver_id, driver_kill.wait())

        if self._submit_sp and self._submit_sp.poll() is None:
            self.log.info('Sending kill signal to %s',
                          self._connection['spark_binary'])
            self._submit_sp.kill()

            if self._yarn_application_id:
                kill_cmd = "yarn application -kill {}".format(
                    self._yarn_application_id).split()
                env = None
                if self._keytab is not None and self._principal is not None:
                    # we are ignoring renewal failures from renew_from_kt
                    # here as the failure could just be due to a non-renewable ticket,
                    # we still attempt to kill the yarn application
                    renew_from_kt(self._principal,
                                  self._keytab,
                                  exit_on_fail=False)
                    env = os.environ.copy()
                    env["KRB5CCNAME"] = airflow_conf.get('kerberos', 'ccache')

                yarn_kill = subprocess.Popen(kill_cmd,
                                             env=env,
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE)

                self.log.info("YARN app killed with return code: %s",
                              yarn_kill.wait())

            if self._kubernetes_driver_pod:
                self.log.info('Killing pod %s on Kubernetes',
                              self._kubernetes_driver_pod)

                # Currently only instantiate Kubernetes client for killing a spark pod.
                try:
                    import kubernetes

                    client = kube_client.get_kube_client()
                    api_response = client.delete_namespaced_pod(
                        self._kubernetes_driver_pod,
                        self._connection['namespace'],
                        body=kubernetes.client.V1DeleteOptions(),
                        pretty=True,
                    )

                    self.log.info("Spark on K8s killed with response: %s",
                                  api_response)

                except kube_client.ApiException as e:
                    self.log.error(
                        "Exception when attempting to kill Spark on K8s:")
                    self.log.exception(e)
Example #8
    def test_load_file_config(self, _, _2):
        client = get_kube_client(in_cluster=False)
        assert isinstance(client.api_client.configuration,
                          RefreshConfiguration)
Example #9
    def test_load_cluster_config(self, _):
        client = get_kube_client(in_cluster=True)
        assert not isinstance(client.api_client.configuration,
                              RefreshConfiguration)
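The extra underscore parameters in the two test signatures above imply mock.patch decorators that were dropped from the snippet. A plausible reconstruction; both patch targets are assumptions, not the test suite's actual decorators:

from unittest import mock

@mock.patch('airflow.kubernetes.refresh_config._get_kube_config_loader_for_yaml_file')  # assumed target
@mock.patch('airflow.kubernetes.kube_client.config')  # assumed target
def test_load_file_config(self, _, _2):
    client = get_kube_client(in_cluster=False)
    assert isinstance(client.api_client.configuration, RefreshConfiguration)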
Example #10
    def _read(self, ti, try_number, metadata=None):
        """
        Template method that contains custom logic of reading
        logs given the try_number.

        :param ti: task instance record
        :param try_number: current try_number to read log from
        :param metadata: log metadata,
                         can be used for streaming log reading and auto-tailing.
        :return: log message as a string and metadata.
        """
        # Task instance here might be different from task instance when
        # initializing the handler. Thus explicitly getting log location
        # is needed to get correct log path.
        log_relative_path = self._render_filename(ti, try_number)
        location = os.path.join(self.local_base, log_relative_path)

        log = ""

        if os.path.exists(location):
            try:
                with open(location, encoding="utf-8",
                          errors="surrogateescape") as file:
                    log += f"*** Reading local file: {location}\n"
                    log += "".join(file.readlines())
            except Exception as e:
                log = f"*** Failed to load local log file: {location}\n"
                log += f"*** {str(e)}\n"
        elif conf.get('core', 'executor') == 'KubernetesExecutor':
            try:
                from airflow.kubernetes.kube_client import get_kube_client

                kube_client = get_kube_client()

                if len(ti.hostname) >= 63:
                    # Kubernetes takes the pod name and truncates it for the hostname. This truncated hostname
                    # is returned for the fqdn to comply with the 63 character limit imposed by DNS standards
                    # on any label of a FQDN.
                    pod_list = kube_client.list_namespaced_pod(
                        conf.get('kubernetes', 'namespace'))
                    matches = [
                        pod.metadata.name for pod in pod_list.items
                        if pod.metadata.name.startswith(ti.hostname)
                    ]
                    if len(matches) == 1:
                        if len(matches[0]) > len(ti.hostname):
                            ti.hostname = matches[0]

                log += f'*** Trying to get logs (last 100 lines) from worker pod {ti.hostname} ***\n\n'

                res = kube_client.read_namespaced_pod_log(
                    name=ti.hostname,
                    namespace=conf.get('kubernetes', 'namespace'),
                    container='base',
                    follow=False,
                    tail_lines=100,
                    _preload_content=False,
                )

                for line in res:
                    log += line.decode()

            except Exception as f:
                log += f'*** Unable to fetch logs from worker pod {ti.hostname} ***\n{str(f)}\n\n'
        else:
            import httpx

            url = os.path.join(
                "http://{ti.hostname}:{worker_log_server_port}/log",
                log_relative_path).format(ti=ti,
                                          worker_log_server_port=conf.get(
                                              'logging',
                                              'WORKER_LOG_SERVER_PORT'))
            log += f"*** Log file does not exist: {location}\n"
            log += f"*** Fetching from: {url}\n"
            try:
                timeout = None  # No timeout
                try:
                    timeout = conf.getint('webserver', 'log_fetch_timeout_sec')
                except (AirflowConfigException, ValueError):
                    pass

                signer = JWTSigner(
                    secret_key=conf.get('webserver', 'secret_key'),
                    expiration_time_in_seconds=conf.getint(
                        'webserver', 'log_request_clock_grace', fallback=30),
                    audience="task-instance-logs",
                )
                response = httpx.get(
                    url,
                    timeout=timeout,
                    headers={
                        b'Authorization':
                        signer.generate_signed_token(
                            {"filename": log_relative_path})
                    },
                )
                response.encoding = "utf-8"

                if response.status_code == 403:
                    log += (
                        "*** !!!! Please make sure that all your Airflow components (e.g. "
                        "schedulers, webservers and workers) have "
                        "the same 'secret_key' configured in 'webserver' section and "
                        "time is synchronized on all your machines (for example with ntpd) !!!!!\n***"
                    )
                    log += (
                        "*** See more at https://airflow.apache.org/docs/apache-airflow/"
                        "stable/configurations-ref.html#secret-key\n***")
                # Check if the resource was properly fetched
                response.raise_for_status()

                log += '\n' + response.text
            except Exception as e:
                log += f"*** Failed to fetch log file from worker. {str(e)}\n"

        return log, {'end_of_log': True}
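For completeness: the Authorization header generated above can be checked on the log-serving side with the same JWTSigner settings. A minimal sketch, assuming a hypothetical token variable holding the incoming header value:

from airflow.utils.jwt_signer import JWTSigner

signer = JWTSigner(
    secret_key=conf.get('webserver', 'secret_key'),
    expiration_time_in_seconds=conf.getint(
        'webserver', 'log_request_clock_grace', fallback=30),
    audience="task-instance-logs",
)
# verify_token raises if the signature, audience, or expiry is invalid.
payload = signer.verify_token(token)
assert payload["filename"] == log_relative_path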