def clean_up(self, job_name):
    """ Deletes the job. Deleting the job deletes all related pods. """
    logging.info("KILLING job {}".format(str(job_name)))
    result = retryable_check_output(args=namespaced_kubectl() + [
        'delete',
        '--ignore-not-found=true',  # in case we hit an edge case on retry
        'job',
        job_name
    ])
    logging.info(result)
def get_pods(self, job_name):
    """ Return pods for a given job_name
    :param job_name: a unique job name
    :type job_name: string
    :return: result from kubectl command that contains job pod and container information
    :rtype: dict
    """
    return json.loads(
        retryable_check_output(
            args=namespaced_kubectl() +
            ['get', 'pods', '-o', 'json', '-l', 'job-name==%s' % job_name]))
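# For reference, the parsed `kubectl get pods -o json` result that get_pods()
# returns has roughly this shape; only the fields the methods below rely on
# are shown:
#
#   {
#       "items": [
#           {
#               "metadata": {"name": "<pod-name>"},
#               "spec": {"containers": [{"name": "<container-name>"}, ...]},
#               "status": {"phase": "Running", "containerStatuses": [...]}
#           },
#           ...
#       ]
#   }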
def log_container_logs(self, job_name, pod_output=None):
    """ Reads the logs from each container in each pod in the job and re-logs them here
    :param job_name: job that owns the pods with the containers we want to log
    :param pod_output: Result of get_pods(job_name) call. If None, will be requested.
        This is a convenience so we can share/reuse the results of get_pods()
    :return:
    """
    pod_output = pod_output or self.get_pods(job_name)
    for pod in pod_output['items']:
        pod_name = pod['metadata']['name']
        for container in pod['spec']['containers']:
            container_name = container['name']
            extra = dict(pod=pod_name, container=container_name)
            logging.info(
                'LOGGING OUTPUT FROM JOB [%s/%s]:' % (pod_name, container_name),
                extra=extra)
            output = retryable_check_output(
                args=namespaced_kubectl() + ['logs', pod_name, container_name])
            for line in output.splitlines():
                logging.info(line, extra=extra)
def execute(self, context, session=None):
    if self.die_if_duplicate:
        current_task_instance = TaskInstance(self, context['execution_date'])
        current_task_instance.refresh_from_db(include_queue_time=True)
        TI = TaskInstance
        instances_that_are_running = session.query(TI).filter(
            TI.dag_id == current_task_instance.dag_id,
            TI.task_id == current_task_instance.task_id,
            TI.state.in_([State.RUNNING, State.UP_FOR_RETRY, State.QUEUED]),
        ).all()
        should_die = False
        for task_instance in instances_that_are_running:
            if task_instance.queued_dttm < current_task_instance.queued_dttm:
                should_die = True
                break
        if should_die:
            raise Exception(
                "A prior execution of this task is already running! Failing this execution."
            )
    job_name, job_yaml_string = self.create_job_yaml(context)
    try:
        # Set pod_output to None up front; this prevents a log_container_logs error
        # if polling fails and self.poll_job_completion never returns pod_output.
        pod_output = None
        logging.info(job_yaml_string)
        self.instance_names.append(
            job_name)  # should happen once, but safety first!
        self.xcom_push(context, "kubernetes_job_name", job_name)
        with tempfile.NamedTemporaryFile(suffix='.yaml') as f:
            f.write(job_yaml_string)
            f.flush()
            result = subprocess.check_output(args=namespaced_kubectl() +
                                             ['apply', '-f', f.name])
            logging.info(result)
            pod_output = self.poll_job_completion(job_name)
            pod_output = pod_output or self.get_pods(
                job_name)  # if we didn't get it for some reason
        return None
    finally:
        try:
            # don't consider the job failed if this fails!
            self.log_container_logs(job_name, pod_output=pod_output)
            stacktrace = traceback.format_exc().rstrip()
            if stacktrace != "None":
                logging.error(
                    "Got an exception during airflow worker execution! Stack trace:\n{}"
                    .format(stacktrace))
            if pod_output:
                # let's clean up all our old pods. we'll kill the entry point (PID 1) in each running container
                for pod in pod_output.get('items', []):
                    # if we never got to running, there won't be containerStatuses
                    if 'containerStatuses' in pod['status']:
                        live_containers = [
                            cs['name']
                            for cs in pod['status']['containerStatuses']
                            if 'running' in cs['state']
                        ]
                        for cname in live_containers:
                            logging.info(
                                'killing dependent live container %s' % cname)
                            # there is a race condition between reading the status and trying to kill the running
                            # container. ignore the return code to duck the issue.
                            subprocess.call(namespaced_kubectl() + [
                                'exec', pod['metadata']['name'], '-c', cname,
                                'kill', '1'
                            ])
        except Exception as ex:
            logging.error(
                "Failed to clean up kubernetes job:\n%s" %
                traceback.format_exc(),
                extra={'err': ex})
        self.clean_up(job_name)
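# For reference, each entry in pod['status']['containerStatuses'] (inspected by
# the cleanup above and by poll_job_completion below) looks roughly like one of:
#
#   {"name": "main", "state": {"running": {"startedAt": "..."}}}
#   {"name": "main", "state": {"waiting": {"reason": "..."}}}
#   {"name": "main", "state": {"terminated": {"exitCode": 0, ...}}}
#
# i.e. the `state` dict carries exactly one of running/waiting/terminated, which
# is why the code checks for key membership like `'running' in cs['state']`.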
def poll_job_completion(self, job_name, dependent_containers={'cloudsql-proxy'}):
    """ Polls for completion of the created job.
    Sleeps for sleep_seconds_between_polling seconds between polls.
    Any failed pods will raise an error and fail the KubernetesJobOperator task.
    """
    logging.info('Polling for completion of job: %s' % job_name)
    pod_output = None  # keeping this out here so we can reuse it in the "finally" clause
    has_live_existed = False
    while True:
        time.sleep(self.sleep_seconds_between_polling)
        pod_output = self.get_pods(job_name)
        job_description = json.loads(
            retryable_check_output(namespaced_kubectl() +
                                   ['get', 'job', '-o', 'json', job_name]))
        status_block = job_description['status']
        if 'succeeded' in status_block and 'failed' in status_block:
            raise Exception(
                "Invalid status block containing both succeeded and failed: %s"
                % json.dumps(status_block))
        if 'active' in status_block:
            status = 'running'
        elif 'failed' in status_block:
            status = 'failed'
        elif 'succeeded' in status_block:
            status = 'complete'
        else:
            status = 'pending'
        logging.info('Current status is: %s' % status)
        if 'pending' == status:
            pass
        if 'failed' == status:
            raise Exception('%s has failed pods, failing task.' % job_name)
        if 'complete' == status:
            return pod_output

        # Determine if we have any containers left running in each pod of the job.
        # Dependent containers don't count.
        # If there are no pods left running anything, we are done here. Cleaning up
        # dependent containers will be left to the top-level `finally` block down below.
        has_live = False
        for pod in pod_output['items']:
            if 'Unknown' == pod['status']['phase']:
                # we haven't run yet
                has_live = True
                break
            elif 'Pending' == pod['status']['phase']:
                has_live = True
                start_time_s = pod['status'].get('startTime')
                if not start_time_s:
                    logging.info('Pod not yet started')
                    break
                start_time = datetime.strptime(start_time_s,
                                               "%Y-%m-%dT%H:%M:%SZ")
                start_duration_secs = (
                    datetime.utcnow() - start_time).total_seconds()
                if start_duration_secs > 300:
                    raise Exception(
                        '%s has failed to start after %0.2f seconds' % (
                            job_name,
                            start_duration_secs,
                        ))
            elif 'Running' == pod['status']['phase']:
                # get all of the independent containers that are still alive (running or waiting)
                live_cnt = 0
                for cs in pod['status']['containerStatuses']:
                    if cs['name'] in dependent_containers:
                        pass
                    elif 'terminated' in cs['state']:
                        has_live_existed = True
                        exit_code = int(
                            cs['state']['terminated'].get('exitCode', 0))
                        if exit_code > 0:
                            raise Exception(
                                '%s has failed pods, failing task.' % job_name)
                    else:
                        live_cnt += 1
                if live_cnt > 0:
                    has_live = True
                    break
            elif 'Succeeded' == pod['status']['phase']:
                # For us to end up in this block, the job has to be Running and the pod has to be Succeeded.
                # This happens when (on a previous attempt) we successfully finished execution, killed dependent
                # containers, and failed to delete the job.
                # In this scenario, we want to immediately stop polling, and retry job deletion.
                has_live_existed = True
                has_live = False
            elif 'Failed' == pod['status']['phase']:
                raise Exception("Containers failed!")
            else:
                raise Exception(
                    "Encountered pod state {state} - no behavior has been prepared for pods in this state!"
                    .format(state=pod['status']['phase']))
        total_pods = len(pod_output['items'])
        logging.info("total pods: {total_pods}".format(total_pods=total_pods))
        has_live_existed = has_live_existed or has_live
        # if we get to this point but for some reason there are no pods, log it and retry
        if not has_live_existed:
            logging.info('No pods have run. Retrying.')
        # we have no live pods, but live pods have existed.
        elif not has_live:
            logging.info('No live, independent pods left.')
            return pod_output
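# For reference, the job `status` block that poll_job_completion() maps to a
# coarse status looks roughly like this (a Kubernetes JobStatus; `active`,
# `succeeded`, and `failed` are pod counts that are omitted when zero, which is
# why the code tests for key presence rather than values):
#
#   {"startTime": "2018-01-01T00:00:00Z", "active": 1}
#   {"startTime": "2018-01-01T00:00:00Z", "succeeded": 1}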
def get_hostname(self, ti):
    """ Return the IP of the pod named by this task instance's hostname. """
    return json.loads(
        subprocess.check_output(
            namespaced_kubectl() +
            ["get", "-o", "json", "pod/{}".format(ti.hostname)])
    )['status']['podIP']
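# `namespaced_kubectl()` and `retryable_check_output()` are used throughout this
# module but are not defined in this section. The sketch below is one plausible,
# hypothetical implementation, assuming a module-level KUBE_NAMESPACE constant
# and simple retry-on-failure semantics (with `subprocess` and `time` imported
# as elsewhere in this module); the real helpers may differ.
def namespaced_kubectl():
    # Base kubectl command pinned to the namespace this operator manages.
    return ['kubectl', '--namespace', KUBE_NAMESPACE]


def retryable_check_output(args, retries=3, sleep_seconds=5):
    # subprocess.check_output with naive retries, since kubectl calls can fail
    # transiently (API server hiccups, network blips). Re-raises on the last
    # failed attempt.
    for attempt in range(retries):
        try:
            return subprocess.check_output(args=args)
        except subprocess.CalledProcessError:
            if attempt == retries - 1:
                raise
            time.sleep(sleep_seconds)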