Example #1
0
 async def _watch_docker_events(self):
     """
     Watch the raw docker event stream and republish it as event objects.

     Only "die" and "oom" events are requested from docker. A container
     "die" event is sent on self._docker_events_publisher as an
     EventContainerDied carrying the container id and its exit code (-1 when
     the exit code cannot be parsed). A container "oom" event is sent as an
     EventContainerOOM. Any other event raises TypeError, which is logged and
     ends the watcher.

     Ordinary errors are logged and end the coroutine. Task cancellation and
     interpreter-exit signals are not swallowed.
     """
     try:
         source = AsyncIteratorWrapper(
             self._docker.event_stream(filters={"event": ["die", "oom"]}))
         async for i in source:
             if i["Type"] == "container" and i["status"] == "die":
                 container_id = i["id"]
                 try:
                     retval = int(i["Actor"]["Attributes"]["exitCode"])
                 except (LookupError, TypeError, ValueError):
                     # Missing or malformed exit code: report it, but still
                     # publish the death of the container.
                     self._logger.exception(
                         "Cannot parse exitCode for container %s",
                         container_id)
                     retval = -1
                 await ZMQUtils.send(
                     self._docker_events_publisher,
                     EventContainerDied(container_id, retval))
             elif i["Type"] == "container" and i["status"] == "oom":
                 await ZMQUtils.send(self._docker_events_publisher,
                                     EventContainerOOM(i["id"]))
             else:
                 # The stream is filtered, so anything else is unexpected.
                 raise TypeError(str(i))
     except Exception:
         # Exception (not a bare except) so that cancellation, which is a
         # BaseException on recent Python versions, can propagate.
         self._logger.exception("Exception in _watch_docker_events")
Example #2
0
    async def _watch_docker_events(self):
        """
        Watch the raw docker event stream and dispatch container events.

        Only "die" and "oom" events are requested from docker.

        - On a container "die" event, the matching closing handler is scheduled
          on self._loop, depending on which running-container collection holds
          the container id. The exit code is passed along, or -1 when it
          cannot be parsed. Unknown container ids are ignored.
        - On a container "oom" event for a known (student) container, the
          container is marked as killed for "overflow" and a best-effort kill
          is started in the default executor.
        - Any other event raises TypeError, which is logged and ends the
          watcher.

        Ordinary errors are logged and end the coroutine. Task cancellation and
        interpreter-exit signals are not swallowed.
        """

        def _discard_kill_outcome(future):
            # The kill can fail and that is normal. Retrieving the exception
            # marks it as handled, so asyncio does not report it as
            # "never retrieved".
            if not future.cancelled():
                future.exception()

        try:
            source = AsyncIteratorWrapper(self._docker.event_stream(filters={"event": ["die", "oom"]}))
            async for i in source:
                if i["Type"] == "container" and i["status"] == "die":
                    container_id = i["id"]
                    try:
                        retval = int(i["Actor"]["Attributes"]["exitCode"])
                    except (LookupError, TypeError, ValueError):
                        self._logger.exception("Cannot parse exitCode for container %s", container_id)
                        retval = -1

                    if container_id in self._containers_running:
                        self._loop.create_task(self.handle_job_closing(container_id, retval))
                    elif container_id in self._student_containers_running:
                        self._loop.create_task(self.handle_student_job_closing(container_id, retval))
                    elif container_id in self._batch_containers_running:
                        self._loop.create_task(self.handle_batch_job_closing(container_id, retval))
                elif i["Type"] == "container" and i["status"] == "oom":
                    container_id = i["id"]
                    if container_id in self._containers_running or container_id in self._student_containers_running:
                        self._logger.info("Container %s did OOM, killing it", container_id)
                        self._containers_killed[container_id] = "overflow"
                        try:
                            # run_in_executor already returns a Future, so it must not be
                            # wrapped in create_task (which only accepts coroutines).
                            # The id is passed as an argument rather than captured in a
                            # lambda, so later events cannot change which container is killed.
                            kill_future = self._loop.run_in_executor(
                                None, self._docker.kill_container, container_id)
                        except Exception:
                            # Best-effort: failing to schedule the kill must not stop the watcher.
                            pass
                        else:
                            kill_future.add_done_callback(_discard_kill_outcome)
                else:
                    # The stream is filtered, so anything else is unexpected.
                    raise TypeError(str(i))
        except Exception:
            # Exception (not a bare except) so that cancellation, which is a
            # BaseException on recent Python versions, can propagate.
            self._logger.exception("Exception in _watch_docker_events")
 async def _handle_container_timeout(self, container_id, timeout):
     """
     Kill a container once its CPU time exceeds the given timeout.

     Iterates over the stats updates obtained from
     self._docker_interface.get_stats and compares the reported total CPU
     usage with the timeout converted to nanoseconds. A None update also
     triggers a kill. The coroutine returns after the first kill; cancellation
     ends it silently, and any other error is logged.

     :param container_id: id of the container to watch
     :param timeout: in seconds (cpu time)
     """
     try:
         docker_stats = await self._docker_interface.get_stats(container_id)
         source = AsyncIteratorWrapper(docker_stats)
         nano_timeout = timeout * (10**9)
         async for upd in source:
             if upd is None:
                 # No usable stats: kill the container and stop here. Falling
                 # through would subscript None and log a spurious exception.
                 await self._kill_it_with_fire(container_id)
                 return
             cpu_usage = upd['cpu_stats']['cpu_usage']['total_usage']
             self._logger.debug("%i", cpu_usage)
             if cpu_usage > nano_timeout:
                 self._logger.info(
                     "Killing container %s as it used %i CPU seconds (max was %i)",
                     container_id,
                     int(cpu_usage / (10**9)), timeout)
                 await self._kill_it_with_fire(container_id)
                 return
     except asyncio.CancelledError:
         # Cancellation ends the watch quietly.
         pass
     except Exception:
         self._logger.exception("Exception in _handle_container_timeout")
Example #4
0
    async def _watch_docker_events(self):
        """
            Watch the raw docker event stream and dispatch container events.

            Only "die" and "oom" events are requested from docker.

            - On a container "die" event, the matching closing handler is
              scheduled for known (student) containers. The exit code is passed
              along, or -1 when it cannot be parsed.
            - On a container "oom" event for a known (student) container, the
              container is marked as killed for "overflow" and a kill is
              scheduled.
            - Any other event raises TypeError, which restarts the stream.

            This function should stay active while the agent itself is active.
            The event stream is restarted after any error, resuming from the
            time of the last seen event, until the task is cancelled.
        """
        shutdown = False
        since = None  # last time we saw something. Useful if a restart happens...
        while not shutdown:
            try:
                source = AsyncIteratorWrapper(
                    self._docker.sync.event_stream(
                        filters={"event": ["die", "oom"]}, since=since))
                self._logger.info("Docker event stream started")
                async for i in source:
                    since = i.get('time', since)  # update time if available.

                    if i["Type"] == "container" and i["status"] == "die":
                        container_id = i["id"]
                        try:
                            retval = int(i["Actor"]["Attributes"]["exitCode"])
                        except (LookupError, TypeError, ValueError):
                            # Missing or malformed exit code: report it, but
                            # still handle the death of the container.
                            self._logger.exception(
                                "Cannot parse exitCode for container %s",
                                container_id)
                            retval = -1

                        if container_id in self._containers_running:
                            self._create_safe_task(
                                self.handle_job_closing(container_id, retval))
                        elif container_id in self._student_containers_running:
                            self._create_safe_task(
                                self.handle_student_job_closing(
                                    container_id, retval))
                    elif i["Type"] == "container" and i["status"] == "oom":
                        container_id = i["id"]
                        if container_id in self._containers_running or container_id in self._student_containers_running:
                            self._logger.info(
                                "Container %s did OOM, killing it",
                                container_id)
                            self._containers_killed[container_id] = "overflow"
                            try:
                                self._create_safe_task(
                                    self._docker.kill_container(container_id))
                            except asyncio.CancelledError:
                                raise
                            except Exception:  # this call can sometimes fail, and that is normal.
                                pass
                    else:
                        # The stream is filtered, so anything else is unexpected.
                        raise TypeError(str(i))
                # Reaching this point means the stream ended by itself.
                raise Exception(
                    "Docker stopped feeding the event stream. This should not happen. Restarting the event stream..."
                )
            except asyncio.CancelledError:
                # Cancellation is the normal way to stop the watcher.
                shutdown = True
            except Exception:
                # Exception (not a bare except) so that GeneratorExit,
                # KeyboardInterrupt and SystemExit end the loop instead of
                # restarting the stream.
                self._logger.exception("Exception in _watch_docker_events")