Example #1
    async def count_running_kernels(self):
        """
        updates:
        * self.nb_kernels with the number of running kernels
        * self.last_activity - an epoch timestamp (a number of seconds)
          may be None when running against an older jupyter
        """
        port = self.port_number()
        if not port:
            return
        url = f"http://localhost:{port}/{port}/api/kernels?token={self.name}"
        self.last_activity = None
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    json_str = await response.text()
            api_kernels = json.loads(json_str)
            self.nb_kernels = len(api_kernels)

            last_times = [
                self.last_time(api_kernel) for api_kernel in api_kernels
            ]
            # if times is empty (no kernel): no activity
            self.last_activity = max(last_times, default=0)

        # this tends to happen a lot at times;
        # until we figure it out, let's make it less conspicuous
        except ClientConnectionError as _exc:
            logger.info(f"could not reach {url} for last activity - "
                        f"probably still warming up")

        except Exception:
            logger.exception(f"Cannot probe number of kernels with {self} - unhandled exception")
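
For context, a minimal standalone sketch of the same probe; the port, token and URL layout below are assumptions copied from the snippet, not verified against a live server:

import asyncio
import json

import aiohttp

async def probe_kernels(url):
    # fetch /api/kernels and count the entries,
    # just as count_running_kernels does above
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            json_str = await response.text()
    return len(json.loads(json_str))

# usage - hypothetical port and token
# print(asyncio.run(probe_kernels(
#     "http://localhost:43000/43000/api/kernels?token=mycourse")))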
Example #2
    @staticmethod
    def last_time(kernel_data):
        """
        expects as input the data returned by /api/kernels
        for one kernel, that is to say e.g.:
        {'connections': 1,
         'execution_state': 'idle',
         'id': '15be5b4c-b5f2-46f0-9a9b-ff54f4495cb4',
         'last_activity': '2018-02-19T12:58:25.204761Z',
         'name': 'python3'}

        returns a comparable time (using max) that this kernel
        has been doing something

        Notes:
        * entries with connections == 0 must not be disregarded:
          it is important to keep those alive, as this does not indicate
          a lack of activity
        * last_activity format: we found some items where the milliseconds
          part was simply absent (not even exposed as .0 or the like)
        * if anything goes wrong, it is safer to return a timestamp that
          means 'now' rather than the epoch
        """
        try:
            last_activity = kernel_data['last_activity']
            return MonitoredJupyter.parse_time(last_activity)
        except Exception:
            logger.exception(f"last_time failed with kernel_data = {kernel_data}")
            # to stay on the safe side, return current time
            return time.time()
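
The snippet delegates to MonitoredJupyter.parse_time, which is not shown here; given the note above about the sometimes-missing milliseconds part, a plausible sketch is:

from datetime import datetime, timezone

def parse_time(timestamp):
    # hypothetical reconstruction - accepts both
    # '2018-02-19T12:58:25.204761Z' and the millisecond-less
    # '2018-02-19T12:58:25Z' variant mentioned in the docstring
    for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ"):
        try:
            dt = datetime.strptime(timestamp, fmt)
            return dt.replace(tzinfo=timezone.utc).timestamp()
        except ValueError:
            pass
    raise ValueError(f"unexpected timestamp format: {timestamp}")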
Example #3
 async def co_run(self, idle, lingering):
     try:
         await self._co_run(idle, lingering)
     except Exception as exc:
         # xx this used to be a plain error but, until pip podman 3.x
         # is settled, it's probably best like this
         logger.exception(
             f"unexpected error {type(exc)} "
             f"when dealing with {self.name} - ignored\n...exception={exc}")
Example #4
 def run_once(self):
     try:
         return self._run_once()
     except podman.errors.InternalServerError as exc:
         reporter = logger.exception if sitesettings.DEBUG else logger.error
         reporter(f"{exc} - skipping rest of monitor cycle")
     except Exception:
         logger.exception(
             "Something wrong happened during monitor cycle - skipping")
         return
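
The reporter indirection works because logger.exception and logger.error share the same call signature, so the traceback-or-not choice can be made once; a standalone illustration, with a made-up DEBUG flag standing in for sitesettings.DEBUG:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("monitor")
DEBUG = False   # hypothetical stand-in for sitesettings.DEBUG

try:
    raise RuntimeError("boom")
except RuntimeError as exc:
    # full traceback when debugging, terse one-liner otherwise
    reporter = logger.exception if DEBUG else logger.error
    reporter(f"{exc} - skipping rest of monitor cycle")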
Example #5
    def _scan_containers(self, figures_by_course):

        # initialize all known courses - we want data on all courses
        # even if they don't run any container yet
        logger.info(f"monitor cycle with period={self.period//60}' "
                    f"idle={self.idle//60}' "
                    f"lingering={self.lingering//3600}h")
        hash_by_course = {c.coursename: c.image_hash()
                          for c in CourseDir.objects.all()}

        with podman.ApiConnection(podman_url) as podman_api:
            # returns None when no container is found!
            containers = podman.containers.list_containers(podman_api, all=True) or []
        logger.info(f"found {len(hash_by_course)} courses "
                    f"and {len(containers)} containers")

        monitoreds = []
        for container in containers:
            try:
                name = container['Names'][0]
                coursename, student = name.split('-x-')
                figures_by_course.setdefault(coursename, CourseFigures())
                figures = figures_by_course[coursename]
                # may be None if something is misconfigured
                image_hash = (hash_by_course[coursename]
                              or f"hash not found for course {coursename}")
                monitoreds.append(MonitoredJupyter(
                    container, coursename, student,
                    figures, image_hash))
            # typically non-nbhosting containers
            except ValueError:
                # ignore this container as we don't even know
                # in what course it belongs
                logger.info(f"ignoring non-nbhosting {container}")
            except KeyError:
                # typically hash_by_course[coursename] is failing
                # this may happen when a course gets outdated
                logger.info(f"ignoring container {container} - "
                            f"can't find image hash for {coursename}")
            except Exception:
                logger.exception(f"monitor has to ignore {container}")
                                
        # run the whole stuff
        futures = [mon.co_run(self.idle, self.lingering)
                   for mon in monitoreds]
        
        # asyncio.run(asyncio.gather(*futures))
        asyncio.get_event_loop().run_until_complete(
            asyncio.gather(*futures))
        
        self.system_containers = len(monitoreds)
        self.system_kernels = sum((mon.nb_kernels or 0) for mon in monitoreds)
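
The ValueError branch above hinges on tuple unpacking: name.split('-x-') yields exactly two parts only for nbhosting-style names, and anything else raises. A quick standalone check, with made-up names:

def split_name(name):
    # mirrors the naming convention used above: <coursename>-x-<student>
    coursename, student = name.split('-x-')
    return coursename, student

print(split_name("python101-x-jdoe"))   # ('python101', 'jdoe')
# split_name("registry")                # ValueError: not enough values to unpack
# split_name("a-x-b-x-c")               # ValueError: too many values to unpack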
Example #6
    def run_forever(self):
        tick = time.time()

        # one cycle can take some time as all the jupyters need to be http-probed
        # so let us compute the actual time to wait
        logger.info("nbh-monitor is starting up")
        for c in CourseDir.objects.all():
            Stats(c.coursename).record_monitor_known_counts_line()
        while True:
            try:
                self.run_once()
            # just be extra sure it doesn't crash
            except Exception:
                logger.exception("Unexpected error")
            tick += self.period
            duration = max(0, int(tick - time.time()))
            logger.info(f"monitor is waiting for {duration}s")
            time.sleep(duration)
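
To see why tick advances by period instead of being reset from time.time(), trace the arithmetic with made-up numbers:

period = 60
tick = 1000.0                        # when the loop started
# suppose run_once() took 12s, so 'now' is 1012.0
tick += period                       # 1060.0 - the next scheduled cycle
now = 1012.0
duration = max(0, int(tick - now))
print(duration)                      # 48 - the sleep absorbs the cycle's
                                     # cost, keeping a fixed 60s cadence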
Example #7
    def _gather_system_facts(self, figures_by_course):
        # ds stands for disk_space
        if self._graphroot is None:
            with podman.ApiConnection(podman_url) as podman_api:
                self._graphroot = podman.system.info(podman_api)['store']['graphRoot']
        nbhroot = sitesettings.nbhroot
        system_root = "/"
        disk_spaces = {}
        for name, root in (('container', self._graphroot),
                           ('nbhosting', nbhroot),
                           ('system', system_root)):
            disk_spaces[name] = {}
            try:
                stat = os.statvfs(root)
                disk_spaces[name]['percent'] = round(100 * stat.f_bfree / stat.f_blocks)
                # unit is MiB
                disk_spaces[name]['free'] = round((stat.f_bfree * stat.f_bsize) / (1024**2))

            except Exception:
                disk_spaces[name]['free'] = 0
                disk_spaces[name]['percent'] = 0
                logger.exception(
                    f"monitor cannot compute disk space {name} on {root}")

        # loads
        try:
            uptime_output = subprocess.check_output('uptime').decode().strip()
            end_of_line = uptime_output.split(':')[-1]
            floads = end_of_line.split(', ')
            load1, load5, load15 = [round(100*float(x)) for x in floads]

        except Exception:
            load1, load5, load15 = 0, 0, 0
            logger.exception("monitor cannot compute cpu loads")

        loads = dict(load1=load1, load5=load5, load15=load15)

        # memory from /proc/meminfo - relies on the first three lines
        # being MemTotal, MemFree and MemAvailable, in that order
        try:
            def handle_line(line):
                _label, value, unit = line.split()
                if unit == 'kB':
                    return int(value) * 1024
                logger.warning(f"unexpected unit {unit} in meminfo")
                return 0
            with open("/proc/meminfo") as feed:
                total_line = feed.readline()
                free_line = feed.readline()
                avail_line = feed.readline()
                total_mem = handle_line(total_line)
                free_mem = handle_line(free_line)
                avail_mem = handle_line(avail_line)
        except Exception:
            logger.exception("failed to probe memory")
            total_mem, free_mem, avail_mem = 0, 0, 0

        memory = dict(memory_total=total_mem, memory_free=free_mem, memory_available=avail_mem)

        return disk_spaces, loads, memory
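
The uptime parsing is worth a worked example; with a typical output line (sample values, and note that a locale printing decimal commas would break the float() calls):

sample = "12:58:25 up 3 days, 4:11, 2 users, load average: 0.15, 0.20, 0.18"
end_of_line = sample.split(':')[-1]               # ' 0.15, 0.20, 0.18'
floads = end_of_line.split(', ')                  # [' 0.15', '0.20', '0.18']
print([round(100 * float(x)) for x in floads])    # [15, 20, 18]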
Example #8
 def port_number(self):
     try:
         return self.container['Ports'][0]['hostPort']
     except Exception:
         logger.exception(f"Cannot locate port number for {self}")
         return 0
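
For reference, the container entry is assumed to look roughly like the dict below; the shape is inferred from the subscripts used in these snippets, not from the podman documentation:

container = {
    'Names': ['python101-x-jdoe'],
    'Ports': [{'hostPort': 43000}],   # hypothetical values
}
print(container['Ports'][0]['hostPort'])    # 43000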