def execute(self, context):
    """
    Build a pod from the operator's settings, run it and return its result.

    The pod is assembled with ``pod_generator.PodGenerator`` (ports, volume
    mounts and volumes are added one by one), the remaining operator settings
    are copied onto the generated pod, and the pod is run through
    ``pod_launcher.PodLauncher``.

    :param context: task execution context; not read by this method.
    :return: the ``result`` element of the pair returned by ``launcher.run_pod``.
    :raises AirflowException: if the final state is not ``State.SUCCESS``, or if
        an ``AirflowException`` is raised anywhere in the body; in both cases the
        error is re-raised with a ``Pod Launching failed`` prefix and the
        original exception attached as its cause.
    """
    try:
        client = kube_client.get_kube_client(
            in_cluster=self.in_cluster,
            cluster_context=self.cluster_context,
            config_file=self.config_file)

        gen = pod_generator.PodGenerator()

        for port in self.ports:
            gen.add_port(port)
        for mount in self.volume_mounts:
            gen.add_mount(mount)
        for volume in self.volumes:
            gen.add_volume(volume)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.arguments,
            labels=self.labels,
        )

        # Settings that make_pod() does not take are set on the pod directly.
        pod.service_account_name = self.service_account_name
        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.image_pull_secrets = self.image_pull_secrets
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity
        pod.node_selectors = self.node_selectors
        pod.hostnetwork = self.hostnetwork
        pod.tolerations = self.tolerations
        pod.configmaps = self.configmaps
        pod.security_context = self.security_context
        pod.pod_runtime_info_envs = self.pod_runtime_info_envs
        pod.dnspolicy = self.dnspolicy

        launcher = pod_launcher.PodLauncher(kube_client=client,
                                            extract_xcom=self.do_xcom_push)
        try:
            (final_state, result) = launcher.run_pod(
                pod,
                startup_timeout=self.startup_timeout_seconds,
                logs_connection_timeout=self.logs_connection_timeout_seconds,
                get_logs=self.get_logs)
        finally:
            # Runs whether or not run_pod() raised.
            if self.is_delete_operator_pod:
                launcher.delete_pod(pod)

        if final_state != State.SUCCESS:
            raise AirflowException(
                'Pod returned a failure: {state}'.format(state=final_state))

        return result
    except AirflowException as ex:
        # Chain explicitly so the traceback shows the original error as the cause.
        raise AirflowException(
            'Pod Launching failed: {error}'.format(error=ex)) from ex
def tearDown(self) -> None:
    """Call ``delete_collection_namespaced_pod`` on the ``default`` namespace."""
    # The client is requested with in_cluster=False.
    kube_client.get_kube_client(in_cluster=False).delete_collection_namespaced_pod(
        namespace="default"
    )
def execute(self, context):
    """
    Generate a pod from the operator's settings, run it and return its result.

    ``self.labels`` is updated in place with the Airflow version and a marker
    label, the pod is produced by ``pod_generator.PodGenerator(...).gen_pod()``,
    extended through ``append_to_pod`` and stored on ``self.pod`` before it is
    run through ``pod_launcher.PodLauncher``.

    :param context: task execution context; not read by this method.
    :return: the ``result`` element of the pair returned by ``launcher.run_pod``.
    :raises AirflowException: if the final state is not ``State.SUCCESS``, or if
        an ``AirflowException`` is raised anywhere in the body; in both cases the
        error is re-raised with a ``Pod Launching failed`` prefix and the
        original exception attached as its cause.
    """
    try:
        client = kube_client.get_kube_client(in_cluster=self.in_cluster,
                                             cluster_context=self.cluster_context,
                                             config_file=self.config_file)

        # Add Airflow Version to the label
        # And a label to identify that pod is launched by KubernetesPodOperator
        self.labels.update(
            {
                'airflow_version': airflow_version.replace('+', '-'),
                'kubernetes_pod_operator': 'True',
            }
        )

        pod = pod_generator.PodGenerator(
            image=self.image,
            namespace=self.namespace,
            cmds=self.cmds,
            args=self.arguments,
            labels=self.labels,
            name=self.name,
            envs=self.env_vars,
            extract_xcom=self.do_xcom_push,
            image_pull_policy=self.image_pull_policy,
            node_selectors=self.node_selectors,
            annotations=self.annotations,
            affinity=self.affinity,
            image_pull_secrets=self.image_pull_secrets,
            service_account_name=self.service_account_name,
            hostnetwork=self.hostnetwork,
            tolerations=self.tolerations,
            configmaps=self.configmaps,
            security_context=self.security_context,
            dnspolicy=self.dnspolicy,
            schedulername=self.schedulername,
            pod=self.full_pod_spec,
            init_containers=self.init_containers,
        ).gen_pod()

        # The remaining settings are concatenated and handed to append_to_pod().
        pod = append_to_pod(
            pod,
            self.pod_runtime_info_envs +
            self.ports +
            self.resources +
            self.secrets +
            self.volumes +
            self.volume_mounts
        )

        self.pod = pod

        launcher = pod_launcher.PodLauncher(kube_client=client,
                                            extract_xcom=self.do_xcom_push)

        try:
            (final_state, result) = launcher.run_pod(
                pod,
                startup_timeout=self.startup_timeout_seconds,
                get_logs=self.get_logs)
        finally:
            # Runs whether or not run_pod() raised.
            if self.is_delete_operator_pod:
                launcher.delete_pod(pod)

        if final_state != State.SUCCESS:
            raise AirflowException(
                'Pod returned a failure: {state}'.format(state=final_state)
            )

        return result
    except AirflowException as ex:
        # Chain explicitly so the traceback shows the original error as the cause.
        raise AirflowException('Pod Launching failed: {error}'.format(error=ex)) from ex
def _read(self, ti, try_number, metadata=None):
    """
    Template method that contains custom logic of reading logs given the try_number.

    Sources are tried in this order: the local log file if it exists; the
    worker pod's log when the configured executor is ``KubernetesExecutor``;
    otherwise an HTTP request to the worker's log server. Failures in any
    source are written into the returned log text rather than raised.

    :param ti: task instance record
    :param try_number: current try_number to read log from
    :param metadata: log metadata, can be used for streaming log reading and auto-tailing.
        Not read by this implementation.
    :return: log message as a string and metadata.
    """
    # Task instance here might be different from task instance when
    # initializing the handler. Thus explicitly getting log location
    # is needed to get correct log path.
    log_relative_path = self._render_filename(ti, try_number)
    location = os.path.join(self.local_base, log_relative_path)

    log = ""

    if os.path.exists(location):
        try:
            # Decode as UTF-8 regardless of the process locale, and carry
            # undecodable bytes through as surrogate escapes so that a stray
            # byte does not make the whole file unreadable.
            with open(location, encoding="utf-8", errors="surrogateescape") as file:
                log += "*** Reading local file: {}\n".format(location)
                log += file.read()
        except Exception as e:  # pylint: disable=broad-except
            log = "*** Failed to load local log file: {}\n".format(location)
            log += "*** {}\n".format(str(e))
    elif conf.get('core', 'executor') == 'KubernetesExecutor':
        log += '*** Trying to get logs (last 100 lines) from worker pod {} ***\n\n'\
            .format(ti.hostname)

        try:
            from airflow.kubernetes.kube_client import get_kube_client

            kube_client = get_kube_client()
            res = kube_client.read_namespaced_pod_log(
                name=ti.hostname,
                namespace=conf.get('kubernetes', 'namespace'),
                container='base',
                follow=False,
                tail_lines=100,
                _preload_content=False)

            for line in res:
                log += line.decode()
        except Exception as f:  # pylint: disable=broad-except
            log += '*** Unable to fetch logs from worker pod {} ***\n{}\n\n'.format(
                ti.hostname, str(f))
    else:
        url = os.path.join(
            "http://{ti.hostname}:{worker_log_server_port}/log", log_relative_path
        ).format(
            ti=ti,
            worker_log_server_port=conf.get('celery', 'WORKER_LOG_SERVER_PORT')
        )
        log += "*** Log file does not exist: {}\n".format(location)
        log += "*** Fetching from: {}\n".format(url)
        try:
            timeout = None  # No timeout
            try:
                timeout = conf.getint('webserver', 'log_fetch_timeout_sec')
            except (AirflowConfigException, ValueError):
                pass

            response = requests.get(url, timeout=timeout)
            response.encoding = "utf-8"

            # Check if the resource was properly fetched
            response.raise_for_status()

            log += '\n' + response.text
        except Exception as e:  # pylint: disable=broad-except
            log += "*** Failed to fetch log file from worker. {}\n".format(str(e))

    return log, {'end_of_log': True}
def _read(self, ti, try_number, metadata=None):
    """
    Template method that contains custom logic of reading logs given the try_number.

    Sources are tried in this order: the local log file if it exists; the
    worker pod's log when the configured executor is ``KubernetesExecutor``;
    otherwise a signed HTTP request to the worker's log server. Failures in
    any source are written into the returned log text rather than raised.

    Note that the Kubernetes branch may overwrite ``ti.hostname`` with the
    full pod name when exactly one pod name starts with the stored hostname.

    :param ti: task instance record
    :param try_number: current try_number to read log from
    :param metadata: log metadata, can be used for streaming log reading and auto-tailing.
        Not read by this implementation.
    :return: log message as a string and metadata.
    """
    # Task instance here might be different from task instance when
    # initializing the handler. Thus explicitly getting log location
    # is needed to get correct log path.
    log_relative_path = self._render_filename(ti, try_number)
    location = os.path.join(self.local_base, log_relative_path)

    log = ""

    if os.path.exists(location):
        try:
            # Decode as UTF-8 regardless of the process locale, and carry
            # undecodable bytes through as surrogate escapes so that a stray
            # byte does not make the whole file unreadable.
            with open(location, encoding="utf-8", errors="surrogateescape") as file:
                log += f"*** Reading local file: {location}\n"
                log += file.read()
        except Exception as e:
            log = f"*** Failed to load local log file: {location}\n"
            log += f"*** {str(e)}\n"
    elif conf.get('core', 'executor') == 'KubernetesExecutor':
        try:
            from airflow.kubernetes.kube_client import get_kube_client

            kube_client = get_kube_client()

            if len(ti.hostname) >= 63:
                # Kubernetes takes the pod name and truncates it for the hostname. This truncated hostname
                # is returned for the fqdn to comply with the 63 character limit imposed by DNS standards
                # on any label of a FQDN.
                pod_list = kube_client.list_namespaced_pod(conf.get('kubernetes', 'namespace'))
                matches = [
                    pod.metadata.name
                    for pod in pod_list.items
                    if pod.metadata.name.startswith(ti.hostname)
                ]
                # Only an unambiguous, strictly longer match replaces the hostname.
                if len(matches) == 1:
                    if len(matches[0]) > len(ti.hostname):
                        ti.hostname = matches[0]

            log += '*** Trying to get logs (last 100 lines) from worker pod {} ***\n\n'.format(
                ti.hostname)

            res = kube_client.read_namespaced_pod_log(
                name=ti.hostname,
                namespace=conf.get('kubernetes', 'namespace'),
                container='base',
                follow=False,
                tail_lines=100,
                _preload_content=False,
            )

            for line in res:
                log += line.decode()
        except Exception as f:
            log += f'*** Unable to fetch logs from worker pod {ti.hostname} ***\n{str(f)}\n\n'
    else:
        url = os.path.join(
            "http://{ti.hostname}:{worker_log_server_port}/log", log_relative_path
        ).format(
            ti=ti,
            worker_log_server_port=conf.get('celery', 'WORKER_LOG_SERVER_PORT')
        )
        log += f"*** Log file does not exist: {location}\n"
        log += f"*** Fetching from: {url}\n"
        try:
            timeout = None  # No timeout
            try:
                timeout = conf.getint('webserver', 'log_fetch_timeout_sec')
            except (AirflowConfigException, ValueError):
                pass

            signer = TimedJSONWebSignatureSerializer(
                secret_key=conf.get('webserver', 'secret_key'),
                algorithm_name='HS512',
                expires_in=conf.getint('webserver', 'log_request_clock_grace', fallback=30),
                # This isn't really a "salt", more of a signing context
                salt='task-instance-logs',
            )

            response = httpx.get(
                url, timeout=timeout, headers={'Authorization': signer.dumps(log_relative_path)}
            )
            response.encoding = "utf-8"

            # Check if the resource was properly fetched
            response.raise_for_status()

            log += '\n' + response.text
        except Exception as e:
            log += f"*** Failed to fetch log file from worker. {str(e)}\n"

    return log, {'end_of_log': True}
def cleanup_pods(args):
    """
    Delete finished or evicted pods in ``args.namespace``.

    Pods carrying the Airflow labels are listed page by page (500 per request).
    A pod is deleted when its phase is ``succeeded``, when its phase is
    ``failed`` and its restart policy is ``never``, or when its status reason
    is ``evicted``. An ``ApiException`` raised while deleting is printed to
    stderr and the scan continues.

    :param args: object whose ``namespace`` attribute names the namespace to scan.
    """
    namespace = args.namespace

    # Phase / reason / restart-policy values, compared after lower-casing.
    # See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/
    # and https://kubernetes.io/docs/tasks/administer-cluster/out-of-resource/
    phase_succeeded = 'succeeded'
    phase_failed = 'failed'
    reason_evicted = 'evicted'
    # With restart policy Always/OnFailure the container is restarted and the
    # pod phase stays Running; only with Never does the phase become Failed.
    restart_never = 'never'

    print('Loading Kubernetes configuration')
    client = get_kube_client()
    print(f'Listing pods in namespace {namespace}')

    label_selector = ','.join(
        ['dag_id', 'task_id', 'execution_date', 'try_number', 'airflow_version']
    )
    query = {"namespace": namespace, "limit": 500, "label_selector": label_selector}

    while True:
        page = client.list_namespaced_pod(**query)

        for pod in page.items:
            pod_name = pod.metadata.name
            print(f'Inspecting pod {pod_name}')

            pod_phase = pod.status.phase.lower()
            pod_reason = pod.status.reason.lower() if pod.status.reason else ''
            pod_restart_policy = pod.spec.restart_policy.lower()

            deletable = (
                pod_phase == phase_succeeded
                or (pod_phase == phase_failed and pod_restart_policy == restart_never)
                or pod_reason == reason_evicted
            )

            if not deletable:
                print(f'No action taken on pod {pod_name}')
            else:
                print(
                    f'Deleting pod "{pod_name}" phase "{pod_phase}" and reason "{pod_reason}", '
                    f'restart policy "{pod_restart_policy}"'
                )
                try:
                    _delete_pod(pod.metadata.name, namespace)
                except ApiException as e:
                    print(f"Can't remove POD: {e}", file=sys.stderr)

        # A non-empty continue token means another page is available.
        next_token = page.metadata._continue
        if next_token:
            query["_continue"] = next_token
        else:
            break
def on_kill(self) -> None:
    """
    Kill Spark submit command

    When driver status tracking is on and a driver id is known, the driver
    kill command is run first. If the spark-submit subprocess is still
    running it is killed, and then, where the corresponding identifiers are
    set, the YARN application is killed and the Kubernetes driver pod is
    deleted.
    """
    self.log.debug("Kill Command is being called")

    if self._should_track_driver_status:
        if self._driver_id:
            self.log.info('Killing driver %s on cluster', self._driver_id)

            kill_cmd = self._build_spark_driver_kill_command()
            driver_kill = subprocess.Popen(kill_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # communicate() drains both pipes while waiting; a bare wait() can
            # block forever once the child fills a pipe buffer.
            driver_kill.communicate()

            self.log.info(
                "Spark driver %s killed with return code: %s", self._driver_id, driver_kill.returncode
            )

    if self._submit_sp and self._submit_sp.poll() is None:
        self.log.info('Sending kill signal to %s', self._connection['spark_binary'])
        self._submit_sp.kill()

        if self._yarn_application_id:
            kill_cmd = "yarn application -kill {}".format(self._yarn_application_id).split()
            env = None
            if self._keytab is not None and self._principal is not None:
                # we are ignoring renewal failures from renew_from_kt
                # here as the failure could just be due to a non-renewable ticket,
                # we still attempt to kill the yarn application
                renew_from_kt(self._principal, self._keytab, exit_on_fail=False)
                env = os.environ.copy()
                env["KRB5CCNAME"] = airflow_conf.get('kerberos', 'ccache')

            yarn_kill = subprocess.Popen(
                kill_cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            # Same as above: drain the pipes instead of waiting on them.
            yarn_kill.communicate()

            self.log.info("YARN app killed with return code: %s", yarn_kill.returncode)

        if self._kubernetes_driver_pod:
            self.log.info('Killing pod %s on Kubernetes', self._kubernetes_driver_pod)

            # Currently only instantiate Kubernetes client for killing a spark pod.
            try:
                import kubernetes

                client = kube_client.get_kube_client()
                api_response = client.delete_namespaced_pod(
                    self._kubernetes_driver_pod,
                    self._connection['namespace'],
                    body=kubernetes.client.V1DeleteOptions(),
                    pretty=True,
                )

                self.log.info("Spark on K8s killed with response: %s", api_response)

            except kube_client.ApiException as e:
                self.log.error("Exception when attempting to kill Spark on K8s:")
                self.log.exception(e)
def test_load_file_config(self, _, _2):
    """A client requested with ``in_cluster=False`` carries a ``RefreshConfiguration``."""
    configuration = get_kube_client(in_cluster=False).api_client.configuration
    assert isinstance(configuration, RefreshConfiguration)
def test_load_cluster_config(self, _):
    """A client requested with ``in_cluster=True`` does not carry a ``RefreshConfiguration``."""
    configuration = get_kube_client(in_cluster=True).api_client.configuration
    assert not isinstance(configuration, RefreshConfiguration)
def _read(self, ti, try_number, metadata=None):
    """
    Collect the log text for one try of a task instance.

    The first applicable source wins: the rendered log file under
    ``self.local_base`` if it exists; the worker pod's log (last 100 lines)
    when the configured executor is ``KubernetesExecutor``; otherwise a signed
    HTTP request to the worker's log server. Errors from any source are
    appended to the returned text instead of being raised. The Kubernetes
    branch may replace ``ti.hostname`` with the full pod name.

    :param ti: task instance record
    :param try_number: try number whose log is wanted
    :param metadata: log metadata; not read by this implementation.
    :return: tuple of the log text and ``{'end_of_log': True}``.
    """
    # Render the path from the task instance passed in, which may differ from
    # the one the handler was initialised with.
    relative_path = self._render_filename(ti, try_number)
    local_path = os.path.join(self.local_base, relative_path)

    text = ""

    if os.path.exists(local_path):
        try:
            with open(local_path, encoding="utf-8", errors="surrogateescape") as handle:
                text += f"*** Reading local file: {local_path}\n"
                text += handle.read()
        except Exception as err:
            # Anything gathered so far is replaced by the failure notice.
            text = f"*** Failed to load local log file: {local_path}\n"
            text += f"*** {str(err)}\n"
    elif conf.get('core', 'executor') == 'KubernetesExecutor':
        try:
            from airflow.kubernetes.kube_client import get_kube_client

            client = get_kube_client()

            if len(ti.hostname) >= 63:
                # Kubernetes truncates the pod name to form the hostname so that it
                # fits the 63 character limit DNS places on a FQDN label; look the
                # full pod name up by prefix.
                listing = client.list_namespaced_pod(conf.get('kubernetes', 'namespace'))
                candidates = [
                    item.metadata.name
                    for item in listing.items
                    if item.metadata.name.startswith(ti.hostname)
                ]
                # Only an unambiguous, strictly longer match replaces the hostname.
                if len(candidates) == 1 and len(candidates[0]) > len(ti.hostname):
                    ti.hostname = candidates[0]

            text += f'*** Trying to get logs (last 100 lines) from worker pod {ti.hostname} ***\n\n'

            stream = client.read_namespaced_pod_log(
                name=ti.hostname,
                namespace=conf.get('kubernetes', 'namespace'),
                container='base',
                follow=False,
                tail_lines=100,
                _preload_content=False,
            )

            for chunk in stream:
                text += chunk.decode()
        except Exception as err:
            text += f'*** Unable to fetch logs from worker pod {ti.hostname} ***\n{str(err)}\n\n'
    else:
        import httpx

        url_template = os.path.join(
            "http://{ti.hostname}:{worker_log_server_port}/log", relative_path
        )
        server_port = conf.get('logging', 'WORKER_LOG_SERVER_PORT')
        url = url_template.format(ti=ti, worker_log_server_port=server_port)

        text += f"*** Log file does not exist: {local_path}\n"
        text += f"*** Fetching from: {url}\n"
        try:
            try:
                fetch_timeout = conf.getint('webserver', 'log_fetch_timeout_sec')
            except (AirflowConfigException, ValueError):
                fetch_timeout = None  # No timeout

            signer = JWTSigner(
                secret_key=conf.get('webserver', 'secret_key'),
                expiration_time_in_seconds=conf.getint(
                    'webserver', 'log_request_clock_grace', fallback=30
                ),
                audience="task-instance-logs",
            )
            token = signer.generate_signed_token({"filename": relative_path})

            response = httpx.get(url, timeout=fetch_timeout, headers={b'Authorization': token})
            response.encoding = "utf-8"

            if response.status_code == 403:
                text += (
                    "*** !!!! Please make sure that all your Airflow components (e.g. "
                    "schedulers, webservers and workers) have "
                    "the same 'secret_key' configured in 'webserver' section and "
                    "time is synchronized on all your machines (for example with ntpd) !!!!!\n***"
                )
                text += (
                    "*** See more at https://airflow.apache.org/docs/apache-airflow/"
                    "stable/configurations-ref.html#secret-key\n***"
                )

            # Raises for any error status so the handler below reports it.
            response.raise_for_status()

            text += '\n' + response.text
        except Exception as err:
            text += f"*** Failed to fetch log file from worker. {str(err)}\n"

    return text, {'end_of_log': True}