def run_forever(self):
    """
    Main loop: run one monitoring cycle every `self.period` seconds.

    One cycle can take some time, as all the jupyters need to be
    http-probed; so the sleep time is computed against a fixed
    schedule (tick) rather than sleeping a constant amount.
    """
    tick = time.time()
    logger.info("nbh-monitor is starting up")
    # record a baseline counts line for every known course at startup
    coursenames = CoursesDir().coursenames()
    for coursename in coursenames:
        Stats(coursename).record_monitor_known_counts_line()
    while True:
        try:
            self.run_once()
        # just be extra sure it doesn't crash
        except Exception:
            logger.exception("Unexpected error")
        # advance the fixed schedule; if the cycle overran the period,
        # duration is clamped to 0 and we start the next cycle at once
        tick += self.period
        duration = max(0, int(tick - time.time()))
        logger.info(f"monitor is waiting for {duration}s")
        time.sleep(duration)
async def co_run(self, grace):
    """
    Probe one container and decide its fate.

    A running container idle for more than `grace` seconds gets killed;
    if it additionally runs an outdated course image, it is removed so
    it gets re-created from the current image next time.

    Parameters:
      grace: seconds of inactivity tolerated before killing.
    """
    # stopped containers are useful only for statistics
    if self.container.status != 'running':
        self.figures.count_container(False)
        return
    # count number of kernels and last activity
    await self.count_running_kernels()
    # last_activity may be 0 if no kernel is running inside that container
    # or None if we could not determine it properly
    if self.last_activity is None:
        logger.error(
            "skipping container {} with no known last_activity".format(
                self.name))
        return
    # check there has been activity in the last <grace> seconds
    now = time.time()
    grace_past = now - grace
    idle_minutes = (now - self.last_activity) // 60
    if self.last_activity > grace_past:
        logger.debug("sparing {} that had activity {}' ago".format(
            self, idle_minutes))
        self.figures.count_container(True, self.nb_kernels)
    else:
        if self.last_activity:
            logger.info("{} has been idle for {} mn - killing".format(
                self, idle_minutes))
        else:
            logger.info("{} has no kernel attached - killing".format(self))
        # kill it
        self.container.kill()
        # if that container does not run the expected image hash
        # it is because the course image was upgraded in the meanwhile
        # then we even remove the container so it will get re-created
        # next time with the right image this time
        actual_hash = self.container.image.id
        if actual_hash != self.hash:
            logger.info(
                "removing container {} - has hash {} instead of expected {}"
                .format(self.name, actual_hash[:15], self.hash[:15]))
            self.container.remove(v=True)
        # this counts for one dead container
        self.figures.count_container(False)
        # keep track of that removal in events.raw
        Stats(self.course).record_kill_jupyter(self.student)
def run_once(self):
    """
    One monitoring cycle:

    * gather figures for all known courses - even container-less ones
    * probe every container asynchronously (idle-kill / image check)
    * measure disk space and cpu load
    * record everything through Stats
    """
    # initialize all known courses - we want data on courses
    # even if they don't run any container yet
    logger.debug("scanning courses")
    coursesdir = CoursesDir()
    coursenames = coursesdir.coursenames()
    figures_by_course = {coursename: CourseFigures()
                         for coursename in coursenames}
    try:
        proxy = docker.from_env(version='auto')
        logger.debug("scanning containers")
        containers = proxy.containers.list(all=True)
        hash_by_course = {coursename: CourseDir(coursename).image_hash(proxy)
                          for coursename in coursenames}
    except Exception:
        logger.exception(
            "Cannot gather containers list at the docker daemon - skipping")
        return
    # a list of async futures
    futures = []
    for container in containers:
        try:
            name = container.name
            # nbhosting containers are named <coursename>-x-<student>
            coursename, student = name.split('-x-')
            # course may not be in the initial dict if misnamed/new
            figures = figures_by_course.setdefault(coursename, CourseFigures())
            # may be None if s/t is misconfigured
            image_hash = hash_by_course[coursename] \
                or "hash not found for course {}".format(coursename)
            monitored_jupyter = MonitoredJupyter(
                container, coursename, student, figures, image_hash)
            futures.append(monitored_jupyter.co_run(self.grace))
        # typically non-nbhosting containers - the name does not split
        except ValueError:
            # ignore this container as we don't even know
            # in what course it belongs
            logger.info("ignoring non-nbhosting {}".format(container))
        except Exception:
            logger.exception(
                "ignoring {} in monitor - unexpected exception".format(
                    container))
    # ds stands for disk_space
    docker_root = proxy.info()['DockerRootDir']
    nbhroot = sitesettings.nbhroot
    system_root = "/"
    ds = {}
    for name, root in (
            ('docker', docker_root),
            ('nbhosting', nbhroot),
            ('system', system_root),
    ):
        ds[name] = {}
        try:
            stat = os.statvfs(root)
            ds[name]['percent'] = round(100 * stat.f_bfree / stat.f_blocks)
            # unit is MiB
            ds[name]['free'] = round(
                (stat.f_bfree * stat.f_bsize) / (1024**2))
        except Exception:
            ds[name]['free'] = 0
            ds[name]['percent'] = 0
            logger.exception(
                "monitor cannot compute disk space with name {} on {}".
                format(name, root))
    # loads - parse the tail of `uptime`: "... load average: a, b, c"
    try:
        uptime_output = subprocess.check_output('uptime').decode().strip()
        end_of_line = uptime_output.split(':')[-1]
        floads = end_of_line.split(', ')
        load1, load5, load15 = [round(100 * float(x)) for x in floads]
    except Exception:
        load1, load5, load15 = 0, 0, 0
        logger.exception("monitor cannot compute cpu loads")
    # run the whole stuff
    asyncio.get_event_loop().run_until_complete(asyncio.gather(*futures))
    # write results
    for coursename, figures in figures_by_course.items():
        student_homes = CourseDir(coursename).student_homes()
        Stats(coursename).record_monitor_counts(
            figures.running_containers, figures.frozen_containers,
            figures.running_kernels,
            student_homes,
            load1, load5, load15,
            ds['docker']['percent'], ds['docker']['free'],
            ds['nbhosting']['percent'], ds['nbhosting']['free'],
            ds['system']['percent'], ds['system']['free'],
        )
async def co_run(self, idle, unused):
    """
    Probe one container and decide its fate; both timeouts in seconds.

    * a stopped container is removed when its image is outdated, or when
      it exited more than `unused` seconds ago
    * a running container is killed after `idle` seconds without kernel
      activity, and also removed if its image is outdated
    """
    now = time.time()
    actual_hash = self.container.image.id
    # stopped containers need to be handled a bit differently
    if self.container.status != 'running':
        if actual_hash != self.image_hash:
            logger.info(f"Removing (stopped & outdated) {self} "
                        f"that has outdated hash {actual_hash[:15]} "
                        f"vs expected {self.image_hash[:15]}")
            self.container.remove(v=True)
        else:
            exited_time = self.exited_time()
            # split the elapsed time into days + remaining hours for logs
            unused_days = int((now - exited_time) // (24 * 3600))
            unused_hours = int((now - exited_time) // 3600 % 24)
            if (now - exited_time) > unused:
                logger.info(f"Removing (stopped & unused) {self} "
                            f"that has been unused for {unused_days} days "
                            f"{unused_hours} hours")
                self.container.remove(v=True)
            else:
                logger.debug(
                    f"Ignoring stopped {self} that "
                    f"exited {unused_days} days {unused_hours} hours ago")
        self.figures.count_container(False)
        return
    # count number of kernels and last activity
    await self.count_running_kernels()
    # last_activity may be 0 if no kernel is running inside that container
    # or None if we could not determine it properly
    if self.last_activity is None:
        logger.error(
            f"Skipping running {self} with no known last_activity")
        return
    # check there has been activity in the last grace_idle_in_minutes
    idle_minutes = int((now - self.last_activity) // 60)
    if (now - self.last_activity) < idle:
        logger.debug(
            f"Sparing running {self} that had activity {idle_minutes} mn ago"
        )
        self.figures.count_container(True, self.nb_kernels)
    else:
        if self.last_activity:
            logger.info(f"Killing (running & idle) {self} "
                        f"that has been idle for {idle_minutes} mn")
        else:
            logger.info(f"Killing (running and empty) {self} "
                        f"that has no kernel attached")
        # kill it
        self.container.kill()
        # keep track of that removal in events.raw
        Stats(self.course).record_kill_jupyter(self.student)
        # if that container does not run the expected image hash
        # it is because the course image was upgraded in the meanwhile
        # then we even remove the container so it will get re-created
        # next time with the right image this time
        if actual_hash != self.image_hash:
            logger.info(f"Removing (just killed & outdated) {self} "
                        f"that has outdated hash {actual_hash[:15]} "
                        f"vs expected {self.image_hash[:15]}")
            self.container.remove(v=True)
        else:
            # this counts for one dead container
            self.figures.count_container(False)