Example #1
0
    async def count_running_kernels(self):
        """
        Probe the jupyter http API and refresh:
        * self.nb_kernels - the number of running kernels
        * self.last_activity - an epoch/timestamp/nb of seconds
          (0 when no kernel is running; None on failure,
          or when using an old jupyter)
        """
        port = self.port_number()
        if not port:
            return
        url = "http://localhost:{}/api/kernels?token={}"\
            .format(port, self.name)
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    raw_answer = await response.text()
            kernels = json.loads(raw_answer)
            self.nb_kernels = len(kernels)
            # most recent activity over all kernels;
            # an empty kernel list means no activity at all
            self.last_activity = max(
                (self.last_time(kernel) for kernel in kernels),
                default=0)

        except Exception as exc:
            logger.exception(
                "Cannot probe number of kernels in {} - {}: {}".format(
                    self, type(exc), exc))
            self.last_activity = None
Example #2
0
 def port_number(self):
     """
     The host port that the container's 8888/tcp port is mapped to,
     as an int; 0 when it cannot be located.
     """
     try:
         ports = self.container.attrs['NetworkSettings']['Ports']
         return int(ports['8888/tcp'][0]['HostPort'])
     except Exception:
         logger.exception(f"Cannot locate port number for {self}")
         return 0
Example #3
0
    def last_time(kernel_data):
        """
        Compute a comparable time (usable with max) telling when this
        kernel was last doing something.

        Expects as input the data returned by /api/kernels
        for one kernel, that is to say e.g.:
        {'connections': 1,
         'execution_state': 'idle',
         'id': '15be5b4c-b5f2-46f0-9a9b-ff54f4495cb4',
         'last_activity': '2018-02-19T12:58:25.204761Z',
         'name': 'python3'}

        Notes:
        * cases where connections = 0 should not be disregarded
          it is important to keep those alive, it does not indicate
          a lack of activity
        * last_activity format: we found some items where the milliseconds
          part was simply not present (at all, i.e. not exposed as .0 or anything)
        * if anything goes wrong, it's best to return a timestamp that means 'now'
          rather than the epoch
        """
        try:
            return MonitoredJupyter.parse_docker_time(
                kernel_data['last_activity'])
        except Exception:
            logger.exception(
                f"last_time failed with kernel_data = {kernel_data}")
            # anything went wrong: stay on the safe side and return 'now'
            return time.time()
Example #4
0
    def run_forever(self):
        """
        Main loop: run one monitoring cycle every self.period seconds,
        forever.

        One cycle can take some time, as all the jupyters need to be
        http-probed; the waiting time is computed against a fixed
        schedule (tick += period) so cycle starts do not drift.
        """
        tick = time.time()

        logger.info("nbh-monitor is starting up")
        coursenames = CoursesDir().coursenames()
        for coursename in coursenames:
            Stats(coursename).record_monitor_known_counts_line()
        while True:
            try:
                self.run_once()
            # just be extra sure it doesn't crash
            except Exception:
                # plain string: was a placeholder-less f-string (lint F541)
                logger.exception("Unexpected error")
            # compute the actual time to wait until the next scheduled tick
            tick += self.period
            duration = max(0, int(tick - time.time()))
            logger.info(f"monitor is waiting for {duration}s")
            time.sleep(duration)
Example #5
0
    def run_once(self):
        """
        One monitoring pass:
        * create async probes for all nbhosting containers found at the
          docker daemon
        * gather disk space (docker / nbhosting / system) and cpu loads
        * run the probes, then record everything through Stats
        """

        # initialize all known courses - we want data on courses
        # even if they don't run any container yet
        logger.debug("scanning courses")
        coursesdir = CoursesDir()
        coursenames = coursesdir.coursenames()
        figures_by_course = {
            coursename: CourseFigures()
            for coursename in coursenames
        }

        try:
            proxy = docker.from_env(version='auto')
            logger.debug("scanning containers")
            containers = proxy.containers.list(all=True)
            # NOTE(review): image_hash presumably returns None when the
            # course image cannot be located - see the fallback below
            hash_by_course = {
                coursename: CourseDir(coursename).image_hash(proxy)
                for coursename in coursenames
            }
        except Exception as e:
            # no docker daemon, nothing we can do this cycle
            logger.exception(
                "Cannot gather containers list at the docker daemon - skipping"
            )
            return

        # a list of async futures
        futures = []
        for container in containers:
            try:
                name = container.name
                # too much spam even in debug mode
                # logger.debug("dealing with container {}".format(name))
                # nbhosting container names are <course>-x-<student>;
                # anything else raises ValueError, caught below
                coursename, student = name.split('-x-')
                figures_by_course.setdefault(coursename, CourseFigures())
                figures = figures_by_course[coursename]
                # may be None if s/t is misconfigured
                hash = hash_by_course[coursename] \
                       or "hash not found for course {}".format(coursename)
                monitored_jupyter = MonitoredJupyter(container, coursename,
                                                     student, figures, hash)
                futures.append(monitored_jupyter.co_run(self.grace))
            # typically non-nbhosting containers
            except ValueError as e:
                # ignore this container as we don't even know
                # in what course it belongs
                logger.info("ignoring non-nbhosting {}".format(container))
            except Exception as e:
                logger.exception(
                    "ignoring {} in monitor - unexpected exception".format(
                        container))
        # ds stands for disk_space
        docker_root = proxy.info()['DockerRootDir']
        nbhroot = sitesettings.nbhroot
        system_root = "/"
        ds = {}
        for name, root in (
            ('docker', docker_root),
            ('nbhosting', nbhroot),
            ('system', system_root),
        ):
            ds[name] = {}
            try:
                stat = os.statvfs(root)
                # percentage of free blocks on that filesystem
                ds[name]['percent'] = round(100 * stat.f_bfree / stat.f_blocks)
                # unit is MiB
                ds[name]['free'] = round(
                    (stat.f_bfree * stat.f_bsize) / (1024**2))

            except Exception as e:
                # report zeros rather than failing the whole cycle
                ds[name]['free'] = 0
                ds[name]['percent'] = 0
                logger.exception(
                    "monitor cannot compute disk space with name {} on {}".
                    format(name, root))

        # loads
        try:
            # the last ':'-separated field of `uptime` output holds the
            # comma-separated 1/5/15 min load averages
            uptime_output = subprocess.check_output('uptime').decode().strip()
            end_of_line = uptime_output.split(':')[-1]
            floads = end_of_line.split(', ')
            # stored as integer percentages
            load1, load5, load15 = [round(100 * float(x)) for x in floads]

        except Exception as e:
            load1, load5, load15 = 0, 0, 0
            logger.exception("monitor cannot compute cpu loads")

        # run the whole stuff
        asyncio.get_event_loop().run_until_complete(asyncio.gather(*futures))
        # write results
        for coursename, figures in figures_by_course.items():
            student_homes = CourseDir(coursename).student_homes()
            Stats(coursename).record_monitor_counts(
                figures.running_containers,
                figures.frozen_containers,
                figures.running_kernels,
                student_homes,
                load1,
                load5,
                load15,
                ds['docker']['percent'],
                ds['docker']['free'],
                ds['nbhosting']['percent'],
                ds['nbhosting']['free'],
                ds['system']['percent'],
                ds['system']['free'],
            )