Example #1
    def clean_up(self, job_name):
        """
        Deletes the job. Deleting the job also deletes all related pods.
        """
        logging.info("KILLING job {}".format(str(job_name)))
        result = retryable_check_output(args=namespaced_kubectl() + [
            'delete',
            '--ignore-not-found=true',  # in case we hit an edge case on retry
            'job',
            job_name
        ])
        logging.info(result)
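All of the examples on this page call two helpers that are not shown here, namespaced_kubectl() and retryable_check_output(), and they also assume module-level imports of json, logging, time, and datetime. The sketch below is only a guess at what those helpers might look like: a kubectl argument list pinned to a namespace, and a subprocess wrapper with a simple retry loop. The helper names come from the examples themselves, but the namespace, retry count, and delay are assumptions.

import logging
import subprocess
import time


def namespaced_kubectl(namespace='default'):
    # Assumed helper: base kubectl invocation pinned to a namespace.
    return ['kubectl', '--namespace', namespace]


def retryable_check_output(args, retries=3, delay_seconds=5):
    # Assumed helper: run a kubectl command and retry transient failures.
    for attempt in range(1, retries + 1):
        try:
            return subprocess.check_output(args).decode('utf-8')
        except subprocess.CalledProcessError as exc:
            logging.warning('kubectl attempt %d/%d failed: %s', attempt, retries, exc)
            if attempt == retries:
                raise
            time.sleep(delay_seconds)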
Example #2

    def get_pods(self, job_name):
        """
        Return pods for a given job_name.

        :param job_name: a unique job name
        :type job_name: string
        :return: result from the kubectl command that contains job pod and container information
        :rtype: dict

        """
        return json.loads(
            retryable_check_output(
                args=namespaced_kubectl() +
                ['get', 'pods', '-o', 'json', '-l',
                 'job-name==%s' % job_name]))
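The later examples index into the result of get_pods() via items, metadata.name, spec.containers, status.phase, and status.containerStatuses, all standard fields of the Kubernetes Pod API. An abridged sketch of the shape this code expects back from kubectl get pods -o json (a real response contains many more fields, and the pod and container names here are placeholders):

# Abridged shape of the parsed JSON that the other examples rely on.
example_pod_output = {
    'items': [
        {
            'metadata': {'name': 'my-job-abc12'},
            'spec': {'containers': [{'name': 'main'}, {'name': 'cloudsql-proxy'}]},
            'status': {
                'phase': 'Running',
                'startTime': '2020-01-01T00:00:00Z',
                'containerStatuses': [
                    {'name': 'main', 'state': {'running': {}}},
                    {'name': 'cloudsql-proxy', 'state': {'running': {}}},
                ],
            },
        },
    ],
}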
Example #3
    def log_container_logs(self, job_name, pod_output=None):
        """
        Reads the logs from each container in each pod in the job and re-logs them through this operator's logger.

        :param job_name: job that owns the pods with the containers we want to log
        :param pod_output: Result of get_pods(job_name) call. If None, will be
                           requested. This is a convenience so we can share/
                           reuse the results of get_pods()
        :return:
        """
        pod_output = pod_output or self.get_pods(job_name)
        for pod in pod_output['items']:
            pod_name = pod['metadata']['name']
            for container in pod['spec']['containers']:
                container_name = container['name']
                extra = dict(pod=pod_name, container=container_name)
                logging.info('LOGGING OUTPUT FROM JOB [%s/%s]:' %
                             (pod_name, container_name),
                             extra=extra)
                output = retryable_check_output(
                    args=namespaced_kubectl() +
                    ['logs', pod_name, container_name])
                for line in output.splitlines():
                    logging.info(line, extra=extra)
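The extra=dict(pod=..., container=...) arguments only show up in the output if the logging format string references those record attributes. A minimal sketch of such a configuration (the format string is an assumption, not part of the original operator, and any record logged without those attributes would then fail to format):

import logging

logging.basicConfig(
    format='%(asctime)s [%(pod)s/%(container)s] %(message)s',
    level=logging.INFO,
)

# Placeholder pod/container names for illustration.
logging.info('container says hello', extra=dict(pod='my-job-abc12', container='main'))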
Example #4
    def poll_job_completion(self,
                            job_name,
                            dependent_containers={'cloudsql-proxy'}):
        """
        Polls for completion of the created job.
        Sleeps for sleep_seconds_between_polling between polling.
        Any failed pods will raise an error and fail the KubernetesJobOperator task.
        """
        logging.info('Polling for completion of job: %s' % job_name)
        pod_output = None  # keeping this out here so we can reuse it in the "finally" clause

        has_live_existed = False
        while True:
            time.sleep(self.sleep_seconds_between_polling)

            pod_output = self.get_pods(job_name)

            job_description = json.loads(
                retryable_check_output(namespaced_kubectl() +
                                       ['get', 'job', "-o", "json", job_name]))

            status_block = job_description['status']

            if 'succeeded' in status_block and 'failed' in status_block:
                raise Exception(
                    "Invalid status block containing both succeeded and failed: %s"
                    % json.dumps(status_block))

            if 'active' in status_block:
                status = 'running'
            elif 'failed' in status_block:
                status = "failed"
            elif 'succeeded' in status_block:
                status = 'complete'
            else:
                status = "pending"

            logging.info('Current status is: %s' % status)

            if "pending" == status:
                pass

            if "failed" == status:
                raise Exception('%s has failed pods, failing task.' % job_name)

            if "complete" == status:
                return pod_output

            # Determine if we have any containers left running in each pod of the job.
            # Dependent containers don't count.
            # If there are no pods left running anything, we are done here. Cleaning up
            # dependent containers will be left to the top-level `finally` block down below.
            has_live = False
            for pod in pod_output['items']:
                if 'Unknown' == pod['status']['phase']:
                    # we haven't run yet
                    has_live = True
                    break
                elif 'Pending' == pod['status']['phase']:
                    has_live = True
                    start_time_s = pod['status'].get('startTime')
                    if not start_time_s:
                        logging.info('Pod not yet started')
                        break
                    start_time = datetime.strptime(start_time_s,
                                                   "%Y-%m-%dT%H:%M:%SZ")
                    start_duration_secs = (datetime.utcnow() -
                                           start_time).total_seconds()
                    if start_duration_secs > 300:
                        raise Exception(
                            '%s has failed to start after %0.2f seconds' % (
                                job_name,
                                start_duration_secs,
                            ))
                elif 'Running' == pod['status']['phase']:
                    # get all of the independent containers that are still alive (running or waiting)
                    live_cnt = 0
                    for cs in pod['status']['containerStatuses']:
                        if cs['name'] in dependent_containers:
                            pass
                        elif 'terminated' in cs['state']:
                            has_live_existed = True
                            exit_code = int(cs['state']['terminated'].get(
                                'exitCode', 0))
                            if exit_code > 0:
                                raise Exception(
                                    '%s has failed pods, failing task.' %
                                    job_name)
                        else:
                            live_cnt += 1

                    if live_cnt > 0:
                        has_live = True
                        break
                elif 'Succeeded' == pod['status']['phase']:
                    # For us to end up in this block, the job has to be Running and the pod has to be Succeeded.
                    # This happens when (on a previous attempt) we successfully finished execution, killed dependent
                    # containers, and failed to delete the job.
                    # In this scenario, we want to immediately stop polling, and retry job deletion.
                    has_live_existed = True
                    has_live = False
                elif 'Failed' == pod['status']['phase']:
                    raise Exception("Containers failed!")
                else:
                    raise Exception(
                        "Encountered pod state {state} - no behavior has been prepared for pods in this state!"
                        .format(state=pod["status"]["phase"]))
            total_pods = len(pod_output['items'])
            logging.info(
                "total pods: {total_pods}".format(total_pods=total_pods))
            has_live_existed = has_live_existed or has_live
            # if we get to this point but for some reason there are no pods, log it and retry
            if not has_live_existed:
                logging.info('No pods have run. Retrying.')
            # we have no live pods, but live pods have existed.
            elif not has_live:
                logging.info('No live, independent pods left.')
                return pod_output
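The comment about a top-level `finally` block and the references to failing the KubernetesJobOperator task point at orchestration code that is not shown on this page. A plausible sketch of how these methods might be tied together, assuming clean_up() should always run even when polling or log collection raises (the execute_job name and structure are guesses, not the original operator code):

def execute_job(operator, job_name):
    # Hypothetical driver: poll until the job finishes, dump container logs,
    # and always delete the job (which also deletes its pods) at the end.
    pod_output = None
    try:
        pod_output = operator.poll_job_completion(job_name)
    finally:
        try:
            operator.log_container_logs(job_name, pod_output=pod_output)
        finally:
            operator.clean_up(job_name)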
Example #5
    def get_pods(self, job_name):
        # Fetch the job's pods (with their container and status details) as a
        # dict parsed from `kubectl get pods -o json`.
        return json.loads(
            retryable_check_output(
                args=namespaced_kubectl() +
                ['get', 'pods', '-o', 'json', '-l',
                 'job-name==%s' % job_name]))