def adopt_launched_task(self, kube_client, pod, pod_ids: dict):
    """
    Take over an already-launched pod by re-labelling it for this executor.

    The pod's ``airflow-worker`` label is set to this executor's
    ``scheduler_job_id`` and the change is pushed through the kube API, so
    that the current KubernetesJobWatcher can monitor it via label selectors.

    :param kube_client: kubernetes client for speaking to kube API
    :param pod: V1Pod spec that we will patch with new label
    :param pod_ids: pod_ids we expect to patch; the entry for this pod is
        removed once the patch request succeeds
    """
    meta = pod.metadata
    self.log.info("attempting to adopt pod %s", meta.name)

    # The label is changed on the in-memory pod object first; it only
    # reaches the cluster if the patch call below is issued.
    labels = meta.labels
    labels['airflow-worker'] = str(self.scheduler_job_id)
    dag_id = labels['dag_id']
    task_id = labels['task_id']
    expected_key = create_pod_id(dag_id=dag_id, task_id=task_id)

    # Guard clause: only pods the caller asked for may be adopted.
    if expected_key not in pod_ids:
        self.log.error(
            "attempting to adopt task %s in dag %s which was not specified by database",
            task_id,
            dag_id,
        )
        return

    try:
        kube_client.patch_namespaced_pod(
            name=meta.name,
            namespace=meta.namespace,
            body=PodGenerator.serialize_pod(pod),
        )
    except ApiException as e:
        # Best effort: a failed patch leaves the entry in pod_ids so the
        # caller can still see that this pod was not adopted.
        self.log.info("Failed to adopt pod %s. Reason: %s", meta.name, e)
    else:
        pod_ids.pop(expected_key)
def run_next(self, next_job: KubernetesJobType) -> None:
    """
    Launch a single queued job as a pod in the cluster.

    The job tuple is unpacked, its command is validated, a worker pod is
    built from the pod template plus the per-task executor config, and the
    pod is submitted asynchronously.

    :param next_job: tuple of ``(key, command, kube_executor_config,
        pod_template_file)`` where ``key`` unpacks to ``(dag_id, task_id,
        run_id, try_number, map_index)``
    :raises ValueError: if the command does not start with
        ``["airflow", "tasks", "run"]``
    :raises AirflowException: if no base worker pod could be loaded from the
        template
    """
    self.log.info('Kubernetes job is %s', str(next_job).replace("\n", " "))

    (
        (dag_id, task_id, run_id, try_number, map_index),
        command,
        executor_config,
        template_file,
    ) = next_job

    required_prefix = ["airflow", "tasks", "run"]
    if command[:3] != required_prefix:
        raise ValueError(
            'The command must start with ["airflow", "tasks", "run"].')

    worker_template = get_base_pod_from_template(template_file, self.kube_config)
    if not worker_template:
        # NOTE(review): the message reports kube_config.pod_template_file even
        # when a per-job template file was passed above -- confirm intended.
        raise AirflowException(
            f"could not find a valid worker template yaml at {self.kube_config.pod_template_file}"
        )

    # construct_pod receives both the template-derived base pod and the
    # per-task override object, and builds the pod that is launched.
    worker_pod = PodGenerator.construct_pod(
        namespace=self.namespace,
        scheduler_job_id=self.scheduler_job_id,
        pod_id=create_pod_id(dag_id, task_id),
        dag_id=dag_id,
        task_id=task_id,
        kube_image=self.kube_config.kube_image,
        try_number=try_number,
        map_index=map_index,
        date=None,
        run_id=run_id,
        args=command,
        pod_override_object=executor_config,
        base_worker_pod=worker_template,
    )

    self.log.debug("Kubernetes running for command %s", command)
    self.log.debug("Kubernetes launching image %s", worker_pod.spec.containers[0].image)

    # the watcher will monitor pods, so we do not block.
    self.run_pod_async(worker_pod, **self.kube_config.kube_client_request_args)
    self.log.debug("Kubernetes Job created!")
def try_adopt_task_instances(
        self, tis: List[TaskInstance]) -> List[TaskInstance]:
    """
    Try to adopt the pods belonging to the given task instances.

    For every distinct ``external_executor_id`` found on the task instances,
    pods labelled ``airflow-worker=<id>`` are listed in the configured
    namespace and handed to ``adopt_launched_task``, which removes adopted
    entries from the expected-pod mapping. Completed pods are then adopted
    via ``_adopt_completed_pods``.

    :param tis: task instances to try to adopt
    :return: task instances that were not adopted: those without an
        ``external_executor_id`` plus those still left in the expected-pod
        mapping after the adoption pass
    """
    tis_to_flush = [ti for ti in tis if not ti.external_executor_id]

    # One API query per *distinct* scheduler id (order-preserving de-dup).
    # Task instances without an id are skipped: they are already queued for
    # flushing above and would only produce an "airflow-worker=None" query.
    scheduler_job_ids = list(dict.fromkeys(
        ti.external_executor_id for ti in tis if ti.external_executor_id
    ))

    # NOTE(review): keys are built from dag_id/task_id only, so two task
    # instances sharing both would collide here -- confirm against callers.
    pod_ids = {
        create_pod_id(dag_id=ti.dag_id, task_id=ti.task_id): ti
        for ti in tis
        if ti.external_executor_id
    }

    kube_client: client.CoreV1Api = self.kube_client
    for scheduler_job_id in scheduler_job_ids:
        pod_list = kube_client.list_namespaced_pod(
            namespace=self.kube_config.kube_namespace,
            label_selector=f'airflow-worker={scheduler_job_id}',
        )
        for pod in pod_list.items:
            self.adopt_launched_task(kube_client, pod, pod_ids)

    self._adopt_completed_pods(kube_client)

    # Whatever is still in pod_ids was not adopted and must be flushed too.
    tis_to_flush.extend(pod_ids.values())
    return tis_to_flush