def reload(self):
    """
    re-inspect this container, no matter what, and store
    the outcome in self.inspection

    a brand new podman connection is opened for the occasion;
    when podman answers with an InternalServerError, the error is
    logged and self.inspection is left as it was
    """
    try:
        with podman.ApiConnection(podman_url) as api:
            details = podman.containers.inspect(api, self.name)
            self.inspection = details
    except podman.errors.InternalServerError:
        logger.error(f"error 500 with {self.name}")
def _gather_system_facts(self, figures_by_course):
    """
    probe the host for disk space, cpu load and memory figures

    Parameters:
      figures_by_course: not used by this method

    Returns:
      a tuple (disk_spaces, loads, memory) where
      * disk_spaces maps each of 'container', 'nbhosting' and 'system'
        to a dict with keys 'percent' (free blocks over total blocks,
        as a rounded percentage) and 'free' (free space in MiB)
      * loads has keys load1, load5 and load15, each one being
        the corresponding load average times 100, rounded
      * memory has keys memory_total, memory_free and memory_available,
        all in bytes

    each probe is best-effort: a failure is logged and the
    corresponding figures are reported as 0
    """
    # the podman graph root is looked up once and cached on the instance
    if self._graphroot is None:
        with podman.ApiConnection(podman_url) as podman_api:
            self._graphroot = podman.system.info(podman_api)['store']['graphRoot']

    # ds stands for disk_space
    roots = (
        ('container', self._graphroot),
        ('nbhosting', sitesettings.nbhroot),
        ('system', "/"),
    )
    disk_spaces = {}
    for name, root in roots:
        try:
            stat = os.statvfs(root)
            disk_spaces[name] = {
                'percent': round(100 * stat.f_bfree / stat.f_blocks),
                # unit is MiB
                'free': round((stat.f_bfree * stat.f_bsize) / (1024**2)),
            }
        except Exception:
            disk_spaces[name] = {'free': 0, 'percent': 0}
            logger.exception(
                f"monitor cannot compute disk space {name} on {root}")

    # loads
    # ask the OS directly rather than parsing the output of uptime,
    # whose formatting (decimal separator in particular) depends on the locale
    try:
        load1, load5, load15 = (round(100 * load) for load in os.getloadavg())
    except Exception:
        load1, load5, load15 = 0, 0, 0
        logger.exception("monitor cannot compute cpu loads")
    loads = dict(load1=load1, load5=load5, load15=load15)

    # memory from /proc/meminfo
    # fields are located by their label, not by their line number,
    # so that a kernel exposing a different set of lines
    # cannot make us report one field under the name of another
    labels = {
        'MemTotal': 'memory_total',
        'MemFree': 'memory_free',
        'MemAvailable': 'memory_available',
    }
    memory = dict.fromkeys(labels.values(), 0)
    try:
        missing = set(labels)
        with open("/proc/meminfo") as feed:
            for line in feed:
                fields = line.split()
                label = fields[0].rstrip(':') if fields else None
                if label not in missing:
                    continue
                missing.discard(label)
                unit = fields[2] if len(fields) > 2 else None
                if unit == 'kB':
                    memory[labels[label]] = int(fields[1]) * 1024
                else:
                    logger.warning(f"unexpected unit {unit} in meminfo")
                if not missing:
                    # no need to read the rest of the file
                    break
        for label in sorted(missing):
            logger.warning(f"{label} not found in meminfo")
    except Exception:
        # all or nothing: do not report a partially filled set of figures
        logger.exception("failed to probe memory")
        memory = dict.fromkeys(labels.values(), 0)

    return disk_spaces, loads, memory
def _scan_containers(self, figures_by_course):
    """
    list all podman containers, wrap the nbhosting ones in
    MonitoredJupyter instances, and run co_run() on all of them

    Parameters:
      figures_by_course: a dict coursename -> CourseFigures;
        an entry is created for each course found in a container name
        that is not in there yet

    Side effects:
      sets self.system_containers to the number of monitored containers,
      and self.system_kernels to the sum of their nb_kernels
      (a nb_kernels of None counts as 0)

    containers that cannot be attached to a known course are logged
    and ignored
    """
    logger.info(f"monitor cycle with period={self.period//60}' "
                f"idle={self.idle//60}' "
                f"lingering={self.lingering//3600}h")
    hash_by_course = {c.coursename: c.image_hash()
                      for c in CourseDir.objects.all()}

    with podman.ApiConnection(podman_url) as podman_api:
        # returns None when no container is found !
        containers = podman.containers.list_containers(podman_api, all=True) or []
    logger.info(f"found {len(hash_by_course)} courses "
                f"and {len(containers)} containers")

    monitoreds = []
    for container in containers:
        # first figure out the course and student from the container name;
        # this is kept apart from the hash lookup below, so that the
        # messages about a course never refer to a coursename that is
        # unset, or left over from the previous container
        try:
            name = container['Names'][0]
            coursename, student = name.split('-x-')
        except ValueError:
            # typically non-nbhosting containers
            # ignore this container as we don't even know
            # in what course it belongs
            logger.info(f"ignoring non-nbhosting {container}")
            continue
        except Exception:
            logger.exception(f"monitor has to ignore {container}")
            continue

        try:
            figures = figures_by_course.setdefault(coursename, CourseFigures())
            if coursename not in hash_by_course:
                # this may happen when a course gets outdated
                logger.info(f"ignoring container {container} - "
                            f"can't find image hash for {coursename}")
                continue
            # may be None if s/t is misconfigured
            image_hash = (hash_by_course[coursename]
                          or f"hash not found for course {coursename}")
            monitoreds.append(MonitoredJupyter(
                container, coursename, student, figures, image_hash))
        except Exception:
            logger.exception(f"monitor has to ignore {container}")

    # run the whole stuff
    # NOTE(review): the original code has a disabled attempt at asyncio.run()
    # here; the explicit event loop is kept as is
    futures = [mon.co_run(self.idle, self.lingering)
               for mon in monitoreds]
    asyncio.get_event_loop().run_until_complete(
        asyncio.gather(*futures))

    self.system_containers = len(monitoreds)
    self.system_kernels = sum((mon.nb_kernels or 0) for mon in monitoreds)
def kill_running_containers(self, *, containers=None, background=False):
    """
    kill the containers passed as arguments

    containers is typically the result of
    self.spot_running_containers(), and that method is what gets
    called when containers is not provided

    background is meant to say whether this call should return
    immediately (background=True) or wait until the containers
    are actually killed (background=False);
    note that the code below does not look at it
    """
    targets = self.spot_running_containers() if containers is None else containers
    with podman.ApiConnection(podman_url) as api:
        for target in targets:
            container_name = target['Names'][0]
            podman.containers.kill(api, container_name)
def spot_running_containers(self):
    """
    returns a list of containers that are currently running
    under this student's name

    a container belongs to this student when its first name
    ends with '-x-' followed by self.name;
    the list is empty when no such container is running
    """
    terminator = f"-x-{self.name}"
    with podman.ApiConnection(podman_url) as podman_api:
        # not specifying all=True means only the running ones
        # list_containers() can return None instead of an empty list,
        # which must not break the filtering below
        containers = podman.containers.list_containers(podman_api) or []
    # keep only this student's containers
    return [
        container for container in containers
        if container['Names'][0].endswith(terminator)
    ]
def show_course(cd, max_name, max_image, max_groups):
    """
    build the one-line description of course cd

    max_name, max_image and max_groups are the widths used to
    left-align the name, image and groups columns (one more
    character is added to each)

    how much gets displayed depends on the outer list_flag
    * 0: the course name only
    * 1: name, image and flags ([AP] autopull, [AB] autobuild,
         [AR] archived)
    * 2: same plus the current hash, the groups and the git url
    * 3 and above: in addition podman is asked whether the image
      exists; the image is surrounded with '!' when it does not,
      and with spaces otherwise; the whole line is rendered
      in bold red when podman answers False
    """
    markers = (
        ("[AP]", cd.autopull),
        ("[AB]", cd.autobuild),
        ("[AR]", cd.archived),
    )
    flags = "".join(marker for marker, enabled in markers if enabled)
    flags = f"{flags:13s}"
    hash_part = f"{cd.current_hash():9s}"
    groups_part = f"{groups(cd):{max_groups + 1}s}"
    image = cd.image

    line = f"{cd.coursename:{max_name + 1}s}"
    if list_flag == 0:
        return line

    # None means that the image was not checked
    image_exists = None
    if list_flag >= 3:
        import podman
        podman_url = "unix://localhost/run/podman/podman.sock"
        with podman.ApiConnection(podman_url) as podman_api:
            image_exists = podman.images.image_exists(podman_api, cd.image)
        warning = " " if image_exists else "!"
        image = f"{warning}{image}{warning}"
        # the image column is now 2 characters wider
        max_image += 2

    line += f"{image:{max_image + 1}s}"
    line += flags
    if list_flag == 1:
        return line

    line += hash_part + groups_part + f"{cd.giturl}"
    if image_exists is False:
        # ANSI escape sequences: bold + red, then reset
        escape = chr(27)
        line = f"{escape}[1m{escape}[31m{line}{escape}[0m"
    return line
def remove_container(self):
    """
    ask podman to remove the container named self.name,
    through a connection opened for this sole purpose
    """
    with podman.ApiConnection(podman_url) as api:
        podman.containers.remove(api, self.name)
def kill_container(self):
    """
    ask podman to kill the container named self.name
    """
    # a new connection is opened each time,
    # as this turns out to be much more robust
    with podman.ApiConnection(podman_url) as api:
        podman.containers.kill(api, self.name)
def test_000_ctor(self):
    """an ApiConnection can be built and used as a context manager"""
    with podman.ApiConnection("unix:///") as _connection:
        pass
def test_001_join(self):
    """join() of a path and a params dict gives base + path + query string"""
    with podman.ApiConnection("unix:///") as connection:
        actual = connection.join("/unittest", {"q": "p"})
        expected = "{}/unittest?q=p".format(connection.base)
        self.assertEqual(expected, actual)
def _run_once(self, show_details, show_idle):
    """
    Print a summary of the containers known to podman

    The total number of containers is split like this:
    * total = stopped + running
      running = idle (0 kernels) + active (>= 1 kernel)

    A first line, tagged ALL, is about all the containers;
    when self.patterns is set, a second line tagged SEL is about
    the containers that self.in_scope() accepts

    Parameters:
      show_details: if True, print one line per running container
        with last activity and # of kernels
      show_idle: if True, compute the number of containers
        that have no kernel
    """
    with podman.ApiConnection(podman_url) as api:
        # all=True is required to get the stopped containers too,
        # otherwise the 'stopped' figures below are always 0;
        # also the call can return None instead of an empty list
        containers = podman.containers.list_containers(api, all=True) or []
    all_running = [c for c in containers if c['State'] == 'running']
    all_stopped = [c for c in containers if c['State'] != 'running']

    def monitored(container):
        name = container['Names'][0]
        course, student = name.split('-x-')
        # create one figures instance per container
        figures = CourseFigures()
        return MonitoredJupyter(container, course, student, figures, None)

    running_monitoreds = [monitored(container) for container in all_running]

    if show_details or show_idle:
        # probe them to fill last_activity and nb_kernels
        futures = [mon.count_running_kernels() for mon in running_monitoreds]
        # the probes are run one after the other
        for future in futures:
            loop.run_until_complete(future)

    if show_details:
        # most recently active first
        running_monitoreds.sort(key=lambda mon: mon.last_activity or 0,
                                reverse=True)
        now = time.time()
        width = max((len(c.name) for c in running_monitoreds), default=10)
        for index, mon in enumerate(running_monitoreds, 1):
            if mon.nb_kernels:
                # xxx this somehow shows up UTC
                # maybe it simply needs USE_TZ = True in the django settings
                la = mon.last_activity_human()
                elapsed = int(now - mon.last_activity) // 60
                print(
                    f"{index:<3d}{mon.name:>{width}s} [{mon.nb_kernels:>2d}k] "
                    f"last active {la} - {elapsed:>3d} min ago")
            else:
                # '?' when the number of kernels is unknown
                display = '?' if mon.nb_kernels is None else 0
                print(f"{index:<3d}{mon.name:>{width}s} [-{display}-] ")

    # summary lines end with a newline in details mode only
    sep = "\n" if show_details else ""

    def print_line(stopped, monitoreds, msg):
        nb_stopped = len(stopped)
        if show_idle:
            # a container whose number of kernels is unknown counts as idle
            nb_idle = sum((mon.nb_kernels == 0 or mon.nb_kernels is None)
                          for mon in monitoreds)
            nb_active = len(monitoreds) - nb_idle
            total_kernels = sum((mon.nb_kernels or 0) for mon in monitoreds)
            total = nb_stopped + nb_idle + nb_active
            print(self.now(),
                  f"{msg} {nb_stopped} stopped + "
                  f"({nb_idle} idle + {nb_active} active) "
                  f"= {total} containers"
                  f" with {total_kernels} kernels",
                  end=sep)
        else:
            nb_running = len(monitoreds)
            total = nb_stopped + nb_running
            print(self.now(),
                  f"{msg} {nb_stopped} stopped + "
                  f"{nb_running} running = {total} "
                  f"containers",
                  end=sep)

    print_line(all_stopped, running_monitoreds, "ALL")
    if self.patterns:
        selected_stopped = [c for c in all_stopped if self.in_scope(c)]
        selected_running = [
            mon for mon in running_monitoreds if self.in_scope(mon)
        ]
        if self.continuous:
            print()
        print_line(selected_stopped, selected_running, "SEL")