Beispiel #1
0
def mesos_update_to_event(mesos_status: addict.Dict,
                          task_config: MesosTaskConfig) -> Event:
    """Build a task event from a Mesos status update.

    The fields taken from the update itself (raw payload, task id, task
    config and the current wall-clock time) are overlaid with the
    ``MESOS_STATUS_MAP`` entry registered for ``mesos_status.state``; the
    merged fields are then passed to ``task_event``.

    :param mesos_status: status update; its ``state`` attribute selects the
        ``MESOS_STATUS_MAP`` entry.
    :param task_config: config of the task the update belongs to; supplies
        ``task_id``.
    :returns: the result of ``task_event`` for the merged fields.
    """
    event_fields = {
        'raw': mesos_status,
        'task_id': task_config.task_id,
        'task_config': task_config,
        'timestamp': time.time(),
    }
    # On a key clash the status-map entry overrides the base fields above.
    state_fields = MESOS_STATUS_MAP[mesos_status.state]
    event_fields.update(state_fields)
    return task_event(**event_fields)
Beispiel #2
0
    def __handle_deleted_pod_event(self, event: PodEvent) -> None:
        """ Stop tracking a deleted pod and report its task as killed

        Reads the pod from ``event["object"]``, removes its entry from
        ``self.task_metadata`` and queues a terminal, unsuccessful event with
        platform type "killed" that carries the task config tracked for the pod.

        The metadata lookup is unguarded, so the pod is expected to be tracked
        when this is called.
        """
        deleted_pod = event["object"]
        pod_name = deleted_pod.metadata.name
        # Grab the tracked record before it is discarded; the event needs its
        # task_config.
        tracked = self.task_metadata[pod_name]
        raw_event = event['raw_object']

        logger.info(
            f"Removing {pod_name} from state and emitting 'killed' event.")

        self.task_metadata = self.task_metadata.discard(pod_name)

        killed_event = task_event(
            task_id=pod_name,
            terminal=True,
            success=False,
            timestamp=time.time(),
            raw=raw_event,
            task_config=tracked.task_config,
            platform_type="killed",
        )
        self.event_queue.put(killed_event)
Beispiel #3
0
    def reconcile(self, task_config: KubernetesTaskConfig) -> None:
        """ Bring the tracked state of a task back in sync with its pod

        Fetches the pod named ``task_config.pod_name`` from ``self.namespace``,
        makes sure the task has a metadata entry (creating one through
        ``_initialize_existing_task`` if needed) and stores ``task_config`` on
        that entry. Then, while holding ``self.task_metadata_lock``:

        * if no pod was obtained, the task is recorded as ``TASK_LOST`` and a
          non-terminal "lost" event is queued;
        * otherwise the pod is processed like a MODIFIED pod event.

        :param task_config: config of the task to reconcile
        """
        pod_name = task_config.pod_name
        try:
            pod = self.kube_client.get_pod(namespace=self.namespace,
                                           pod_name=pod_name)
        except Exception:
            # Deliberate best effort: a failed fetch is logged and then handled
            # exactly like a pod that no longer exists.
            logger.exception(
                f"Hit an exception attempting to fetch pod {pod_name}")
            pod = None

        if pod_name not in self.task_metadata:
            self._initialize_existing_task(task_config)

        with self.task_metadata_lock:
            # Rebind the local to the record that carries the new task_config.
            # Previously the local kept pointing at the pre-update record, so
            # the LOST branch below wrote back a record derived from it
            # (dropping the config stored here) and emitted an event with the
            # old config, unlike the modified-pod path which re-reads the map.
            task_metadata = self.task_metadata[pod_name].set(
                task_config=task_config)
            self.task_metadata = self.task_metadata.set(
                pod_name, task_metadata)

            if not pod:
                # Pod has gone away while restarting
                logger.info(
                    f"Pod {pod_name} for task {task_config.name} was no longer found. "
                    "Marking as LOST")
                self.task_metadata = self.task_metadata.set(
                    pod_name,
                    task_metadata.set(
                        task_state=KubernetesTaskState.TASK_LOST,
                        task_state_history=task_metadata.task_state_history.
                        append(
                            (KubernetesTaskState.TASK_LOST, time.time()), )))
                self.event_queue.put(
                    task_event(task_id=pod_name,
                               terminal=False,
                               timestamp=time.time(),
                               raw=None,
                               task_config=task_metadata.task_config,
                               platform_type="lost"))
            else:
                # Treat like a modified pod
                self.__update_modified_pod(pod=pod, event=None)
    def _background_check_task(self, time_now, tasks_to_reconcile, task_id, md):
        """Run the periodic background checks for one tracked task.

        * A task still in ``TASK_INITED`` whose offer timeout has passed is
          killed, dropped from ``self.task_metadata`` and reported through a
          terminal, unsuccessful event.
        * Any task not in ``TASK_INITED`` is appended to
          ``tasks_to_reconcile``.
        * Once a task has been in its current state for at least
          ``self.task_staging_timeout_s``: ``UNKNOWN`` tasks are re-enqueued,
          ``TASK_STAGING`` tasks are killed, marked ``TASK_STUCK`` and their
          agent is blacklisted, and ``TASK_STUCK`` tasks get another kill
          request each time the stuck duration crosses an hour boundary.

        :param time_now: timestamp of this check pass, compared against the
            timestamps stored in ``md.task_state_history``
        :param tasks_to_reconcile: list collecting ids of tasks to reconcile
        :param task_id: id of the task being checked
        :param md: metadata record of the task
        """
        state = md.task_state

        if state == 'TASK_INITED':
            # give up on a task that was not launched within its
            # offer_timeout
            inited_at = md.task_state_history['TASK_INITED']
            offer_timeout = md.task_config.offer_timeout
            if time_now >= inited_at + offer_timeout:
                log.warning(
                    f'Task {task_id} has been waiting for offers '
                    'for longer than configured timeout '
                    f'{offer_timeout}. Giving up and removing the '
                    'task from the task queue.'
                )
                # killing the task also takes it off the queue
                self.kill_task(task_id)
                # mesos is not expected to send a terminal update for this
                # task, so drop it and report the failure manually
                self.task_metadata = self.task_metadata.discard(task_id)
                timeout_event = task_event(
                    task_id=task_id,
                    terminal=True,
                    timestamp=time_now,
                    success=False,
                    message='stop',
                    task_config=md.task_config,
                    raw='Failed due to offer timeout',
                )
                self.event_queue.put(timeout_event)
                get_metric(metrics.TASK_OFFER_TIMEOUT).count(1)
        else:
            tasks_to_reconcile.append(task_id)

        # Killing / re-enqueuing is only considered once the task has sat in
        # its current state for the staging timeout
        entered_state_at = md.task_state_history[state]
        if time_now < entered_state_at + self.task_staging_timeout_s:
            return

        if state == 'UNKNOWN':
            log.warning(
                f'Re-enqueuing task {task_id} in unknown state for '
                f'longer than {self.task_staging_timeout_s}'
            )
            self.enqueue_task(md.task_config)
            get_metric(
                metrics.TASK_FAILED_TO_LAUNCH_COUNT).count(1)
            return

        if state == 'TASK_STAGING':
            log.warning(f'Killing stuck task {task_id}')
            self.kill_task(task_id)
            stuck_md = md.set(
                task_state='TASK_STUCK',
                task_state_history=md.task_state_history.set(
                    'TASK_STUCK', time_now),
            )
            self.task_metadata = self.task_metadata.set(task_id, stuck_md)
            self.blacklist_slave(
                agent_id=self.task_metadata[task_id].agent_id,
                timeout=self.slave_blacklist_timeout_s,
            )
            get_metric(metrics.TASK_STUCK_COUNT).count(1)
            return

        if state != 'TASK_STUCK':
            return

        t = time.time()
        # 10s since the previous iteration plus the time already spent in
        # this one
        since_last_check = 10 + t - time_now
        # how long the task has been in TASK_STUCK, in seconds
        stuck_for = t - md.task_state_history['TASK_STUCK']
        # seconds elapsed since `stuck_for` last crossed an hour boundary;
        # if that happened since the previous background check, re-send the
        # kill request
        if stuck_for % 3600 < since_last_check:
            hours_stuck = stuck_for // 3600
            log.warning(
                f'Task {task_id} is stuck, waiting for terminal '
                f'state for {hours_stuck}h, sending another kill'
            )
            self.kill_task(task_id)
import time

from task_processing.interfaces.event import task_event

# https://github.com/apache/mesos/blob/master/include/mesos/mesos.proto

MESOS_STATUS_MAP = {
    'TASK_STARTING':
    task_event(platform_type='starting', terminal=False),
    'TASK_RUNNING':
    task_event(platform_type='running', terminal=False),
    'TASK_FINISHED':
    task_event(platform_type='finished', terminal=True, success=True),
    'TASK_FAILED':
    task_event(platform_type='failed', terminal=True, success=False),
    'TASK_KILLED':
    task_event(platform_type='killed', terminal=True, success=False),
    'TASK_LOST':
    task_event(platform_type='lost', terminal=True, success=False),
    'TASK_STAGING':
    task_event(platform_type='staging', terminal=False),
    'TASK_ERROR':
    task_event(platform_type='error', terminal=True, success=False),
    'TASK_KILLING':
    task_event(platform_type='killing', terminal=False),
    'TASK_DROPPED':
    task_event(platform_type='dropped', terminal=True, success=False),
    'TASK_UNREACHABLE':
    task_event(platform_type='unreachable', terminal=False),
    'TASK_GONE':
    task_event(platform_type='gone', terminal=True, success=False),
Beispiel #6
0
    def __update_modified_pod(self, pod: V1Pod,
                              event: Optional[PodEvent]) -> None:
        """ Called during reconciliation and normal event handling

        Compares the pod's phase with the task state tracked for it and, on a
        state transition, updates ``self.task_metadata`` and queues the
        matching event:

        * "Succeeded" / "Failed": the pod is removed from the tracked state and
          a terminal "finished" / "failed" event is queued. If the task was
          still pending, a "running" event is queued first.
        * "Running": the task is recorded as running and a "running" event is
          queued.
        * "Unknown": the task is recorded as lost and a non-terminal "lost"
          event is queued.

        Phases outside ``SUPPORTED_POD_MODIFIED_EVENT_PHASES`` and updates that
        cause no transition are only logged.

        :param pod: pod whose status should be reflected in the task state
        :param event: originating pod event, or None (no raw payload is then
            attached to the queued events)
        """
        pod_name = pod.metadata.name
        task_metadata = self.task_metadata[pod_name]
        raw_event = None if not event else event['raw_object']
        phase = pod.status.phase

        def record_state(new_state) -> None:
            # Store the pod's node and the new state, timestamped, in the
            # tracked metadata of this pod.
            self.task_metadata = self.task_metadata.set(
                pod_name,
                task_metadata.set(
                    node_name=pod.spec.node_name,
                    task_state=new_state,
                    task_state_history=task_metadata.task_state_history.append(
                        (new_state, time.time()), )))

        def emit(platform_type, **flags) -> None:
            # Queue an event for this pod; `flags` carries terminal/success.
            self.event_queue.put(
                task_event(task_id=pod_name,
                           **flags,
                           timestamp=time.time(),
                           raw=raw_event,
                           task_config=task_metadata.task_config,
                           platform_type=platform_type))

        if phase not in SUPPORTED_POD_MODIFIED_EVENT_PHASES:
            logger.debug(
                f"Got a MODIFIED event for {pod_name} for unhandled phase: "
                f"{pod.status.phase} - ignoring.")
            return

        current_state = task_metadata.task_state

        # A pod can reach a final phase while the task is still pending here;
        # a "running" event is queued first in that case.
        if (phase in {"Succeeded", "Failed"}
                and current_state is KubernetesTaskState.TASK_PENDING):
            logger.debug(
                f"Adding running event for {pod_name}, Kubernetes appears to have "
                "compacted the Running phase event.")
            record_state(KubernetesTaskState.TASK_RUNNING)
            emit("running", terminal=False)

        if (phase == "Succeeded"
                and current_state is not KubernetesTaskState.TASK_FINISHED):
            logger.info(
                f"Removing {pod_name} from state and emitting 'finished' event.",
            )
            self.task_metadata = self.task_metadata.discard(pod_name)
            emit("finished", terminal=True, success=True)
            return

        if (phase == "Failed"
                and current_state is not KubernetesTaskState.TASK_FAILED):
            logger.info(
                f"Removing {pod_name} from state and emitting 'failed' event.")
            self.task_metadata = self.task_metadata.discard(pod_name)
            emit("failed", terminal=True, success=False)
            return

        if (phase == "Running"
                and current_state is not KubernetesTaskState.TASK_RUNNING):
            logger.info(
                f"Successfully launched {pod_name}, emitting 'running' event.")
            record_state(KubernetesTaskState.TASK_RUNNING)
            emit("running", terminal=False)
            return

        # XXX: work out the correct handling here (and when this really
        # occurs - k8s could not be made to send an event with an Unknown
        # phase)
        if (phase == "Unknown"
                and current_state is not KubernetesTaskState.TASK_LOST):
            logger.info(
                f"Got a MODIFIED event for {pod_name} with unknown phase, host likely "
                "unexpectedly died")
            record_state(KubernetesTaskState.TASK_LOST)
            emit("lost", terminal=False)
            return

        logger.info(
            f"Ignoring MODIFIED event for {pod_name} as it did not result "
            "in a state transition", )