Example #1
0
 def _write_results(self, figures_by_course, disk_spaces, loads, memory):
     coursedirs_by_name = {c.coursename: c for c in CourseDir.objects.all()}
     # write results
     for coursename, figures in figures_by_course.items():
         nb_student_homes = coursedirs_by_name[coursename].nb_student_homes()
         Stats(coursename).record_monitor_counts(
             figures.running_containers,
             figures.frozen_containers,
             figures.running_kernels,
             nb_student_homes,
             loads['load1'],
             loads['load5'],
             loads['load15'],
             disk_spaces['container']['percent'],
             disk_spaces['container']['free'],
             disk_spaces['nbhosting']['percent'],
             disk_spaces['nbhosting']['free'],
             disk_spaces['system']['percent'],
             disk_spaces['system']['free'],
             memory['memory_total'],
             memory['memory_free'],
             memory['memory_available'],
             self.system_containers,
             self.system_kernels,
         )
Example #2
0
 async def co_run(self, grace):
     nbhroot = Path(sitesettings.nbhroot)
     # stopped containers are useful only for statistics
     if self.container.status != 'running':
         self.figures.count_container(False)
         return
     # count number of kernels and last activity
     await self.count_running_kernels()
     # last_activity may be 0 if no kernel is running inside that container
     # or None if we could not determine it properly
     if self.last_activity is None:
         logger.error(
             "skipping container {} with no known last_activity".format(
                 self.name))
         return
     # check there has been activity in the last <grace> seconds
     now = time.time()
     grace_past = now - grace
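     # any activity older than grace_past makes this container a candidate for killing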
     idle_minutes = (now - self.last_activity) // 60
     if self.last_activity > grace_past:
         logger.debug("sparing {} that had activity {}' ago".format(
             self, idle_minutes))
         self.figures.count_container(True, self.nb_kernels)
     else:
         if self.last_activity:
             logger.info("{} has been idle for {} mn - killing".format(
                 self, idle_minutes))
         else:
             logger.info("{} has no kernel attached - killing".format(self))
         # kill it
         self.container.kill()
         # if that container does not run the expected image hash,
         # the course image was upgraded in the meantime;
         # in that case we also remove the container so it gets
         # re-created with the right image next time
         actual_hash = self.container.image.id
         if actual_hash != self.hash:
             logger.info(
                 "removing container {} - has hash {} instead of expected {}"
                 .format(self.name, actual_hash[:15], self.hash[:15]))
             self.container.remove(v=True)
         # this counts for one dead container
         self.figures.count_container(False)
         # keep track of that removal in events.raw
         Stats(self.course).record_kill_jupyter(self.student)
Example #3
0
    def run_forever(self):
        tick = time.time()

        # one cycle can take some time as all the jupyters need to be http-probed
        # so let us compute the actual time to wait
        logger.info("nbh-monitor is starting up")
        for c in CourseDir.objects.all():
            Stats(c.coursename).record_monitor_known_counts_line()
        while True:
            try:
                self.run_once()
            # just be extra sure it doesn't crash
            except Exception:
                logger.exception(f"Unexpected error")
            tick += self.period
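            # schedule the next tick relative to the previous one, so that the
            # period is preserved even if run_once took some time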
            duration = max(0, int(tick - time.time()))
            logger.info(f"monitor is waiting for {duration}s")
            time.sleep(duration)
Example #4
0
    def run_forever(self):
        tick = time.time()

        # one cycle can take some time as all the jupyters need to be http-probed
        # so let us compute the actual time to wait
        logger.info("nbh-monitor is starting up")
        coursenames = CoursesDir().coursenames()
        for coursename in coursenames:
            Stats(coursename).record_monitor_known_counts_line()
        while True:
            try:
                self.run_once()
            # just be extra sure it doesn't crash
            except Exception as e:
                logger.exception(
                    "protecting against unexpected exception {}".format(e))
            tick += self.period
            duration = max(0, int(tick - time.time()))
            logger.info("monitor is waiting for {}s".format(duration))
            time.sleep(duration)
Example #5
0
def _open_notebook(request, coursename, student, notebook,
                   *, forcecopy, init_student_git): # pylint: disable=r0914
    """
    implement both edx_request and classroom_request
    that behave almost exactly the same
    """
    ok, explanation = authorized(request)

    if not ok:
        return HttpResponseForbidden(
            f"Access denied: {explanation}")

    coursedir = CourseDir.objects.get(coursename=coursename)
    if not coursedir.is_valid():
        return error_page(
            request, coursename, student, notebook,
            f"no such course `{coursename}'", header=True,
        )

    # the ipynb extension is removed from the notebook name in urls.py
    exists, notebook_with_ext, _, is_genuine_notebook = \
        locate_notebook(coursedir.git_dir, notebook)

    # second attempt from the student's space
    # in case the student has created it locally...
    if not exists:
        exists, notebook_with_ext, _, is_genuine_notebook = \
            locate_notebook(coursedir.student_dir(student), notebook)

    if not exists:
        msg = f"notebook `{notebook}' not known in this course or student"
        return error_page(request, coursename, student, notebook,
                          msg, header="notebook not found")


    # deal with concurrent requests on the same container
    # by using a shared memory (a redis cache)
    # starting_containers is the cache name
    # as configured in nbhosting.ini(.in)

    # in devel mode we don't have redis
    redis_cache = None
    try:
        import redis
        idling = 0.5
        # a safety net in case our code does not release the token properly
        expire_in_s = 15

        def my_repr(timedelta):
            return f"{timedelta.seconds}s {timedelta.microseconds}µs"

        redis_cache = redis.Redis()
        container = f'{coursename}-x-{student}'
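        # the cache key is the container name; as long as it is set,
        # another request is already busy starting that same container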
        for attempt in itertools.count(1):
            already = redis_cache.get(container)

            # good to go
            if not already:
                logger.info(f"{attempt=} going ahead with {container=} and {notebook=}")
                redis_cache.set(container, b'1')
                redis_cache.expire(container, expire_in_s)
                break

            # the token is still set - wait for it to be released or to expire
            logger.info(f"{attempt=} waiting for {idling=}s because {container} "
                        f"is being started with {notebook=}")
            time.sleep(idling)
    except ModuleNotFoundError:
        # make sure this error does not go unnoticed in production
        if not DEBUG:
            raise


    subcommand = 'container-view-student-course-notebook'

    # build command
    command = ['nbh', '-d', sitesettings.nbhroot]
    if DEBUG:
        command.append('-x')
    command.append(subcommand)
    # propagate the forcecopy flag for reset_from_origin
    if forcecopy:
        command.append('-f')
    # propagate that a git initialization was requested
    # forcecopy has no effect in this case
    if init_student_git:
        command.append('-g')
        # a student repo gets cloned from local course git
        # for lower delays when updating, and removing issues
        # like accessing private repos from the students space
        ref_giturl = str(coursedir.git_dir)
    else:
        ref_giturl = coursedir.giturl

    # add arguments to the subcommand
    command += [student, coursename, notebook_with_ext,
                coursedir.image, ref_giturl]
    command_str = " ".join(command)
    logger.info(f'edxfront is running (DEBUG={DEBUG}): {command_str}')
    completed = subprocess.run(
        command, universal_newlines=True,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    log_completed_process(completed, subcommand)

    try:
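        # the nbh subcommand is expected to print exactly 4 whitespace-separated
        # tokens; anything else sends us to the except clause below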
        action, _container_name, actual_port, jupyter_token = completed.stdout.split()

        if completed.returncode != 0 or action.startswith("failed"):
            message = failed_command_message(
                command_str, completed, prefix="failed to spawn notebook container")
            header = failed_command_header(action)
            return error_page(
                request, coursename, student, notebook, message, header)

        # remember that in events file for statistics
        Stats(coursename).record_open_notebook(student, notebook, action, actual_port)
        # redirect with same proto (http or https) as incoming
        scheme = request.scheme
        # get the host part of the incoming URL
        host = request.get_host()
        # remove initial port if present in URL
        if ':' in host:
            host, _ = host.split(':', 1)
        ########## forge a URL that nginx will intercept
        # passing along course and student is for 'reset_from_origin'
        if is_genuine_notebook:
            url = (f"{scheme}://{host}/{actual_port}/notebooks/"
                   f"{notebook_with_ext}?token={jupyter_token}&"
                   f"course={coursename}&student={student}")
        else:
            url = (f"{scheme}://{host}/{actual_port}/lab/tree/{notebook_with_ext}")
        logger.info(f"edxfront: redirecting to {url}")
        return HttpResponseRedirect(url)

    except Exception as exc:
        prefix = (f"exception when parsing output of nbh {subcommand}\n"
                   f"{type(exc)}: {exc}")
        message = failed_command_message(command_str, completed, prefix=prefix)
        return error_page(
            request, coursename, student, notebook, message)
    finally:
        if redis_cache:
            redis_cache.delete(container)
Example #6
0
def send_material_usage(request, course):
    stats = Stats(course)
    encoded = json.dumps(stats.material_usage())
    response = HttpResponse(encoded, content_type="application/json")
    response['Access-Control-Allow-Origin'] = '*'
    return response
Example #7
0
    def run_once(self):

        # initialize all known courses - we want data on courses
        # even if they don't run any container yet
        logger.debug("scanning courses")
        coursesdir = CoursesDir()
        coursenames = coursesdir.coursenames()
        figures_by_course = {
            coursename: CourseFigures()
            for coursename in coursenames
        }

        try:
            proxy = docker.from_env(version='auto')
            logger.debug("scanning containers")
            containers = proxy.containers.list(all=True)
            hash_by_course = {
                coursename: CourseDir(coursename).image_hash(proxy)
                for coursename in coursenames
            }
        except Exception:
            logger.exception(
                "Cannot gather containers list from the docker daemon - skipping"
            )
            return

        # a list of awaitables - one coroutine per container
        futures = []
        for container in containers:
            try:
                name = container.name
                # too much spam even in debug mode
                # logger.debug("dealing with container {}".format(name))
                coursename, student = name.split('-x-')
                figures_by_course.setdefault(coursename, CourseFigures())
                figures = figures_by_course[coursename]
                # may be None if something is misconfigured
                hash = hash_by_course[coursename] \
                       or "hash not found for course {}".format(coursename)
                monitored_jupyter = MonitoredJupyter(container, coursename,
                                                     student, figures, hash)
                futures.append(monitored_jupyter.co_run(self.grace))
            # typically non-nbhosting containers
            except ValueError:
                # ignore this container as we don't even know
                # what course it belongs to
                logger.info("ignoring non-nbhosting {}".format(container))
            except Exception:
                logger.exception(
                    "ignoring {} in monitor - unexpected exception".format(
                        container))
        # ds stands for disk_space
        docker_root = proxy.info()['DockerRootDir']
        nbhroot = sitesettings.nbhroot
        system_root = "/"
        ds = {}
        for name, root in (
            ('docker', docker_root),
            ('nbhosting', nbhroot),
            ('system', system_root),
        ):
            ds[name] = {}
            try:
                stat = os.statvfs(root)
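                # f_bfree / f_blocks is the fraction of blocks still free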
                ds[name]['percent'] = round(100 * stat.f_bfree / stat.f_blocks)
                # unit is MiB
                ds[name]['free'] = round(
                    (stat.f_bfree * stat.f_bsize) / (1024**2))

            except Exception:
                ds[name]['free'] = 0
                ds[name]['percent'] = 0
                logger.exception(
                    "monitor cannot compute disk space with name {} on {}"
                    .format(name, root))

        # loads
        try:
            uptime_output = subprocess.check_output('uptime').decode().strip()
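            # uptime output typically ends with "load average: 0.08, 0.03, 0.01"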
            end_of_line = uptime_output.split(':')[-1]
            floads = end_of_line.split(', ')
            load1, load5, load15 = [round(100 * float(x)) for x in floads]

        except Exception:
            load1, load5, load15 = 0, 0, 0
            logger.exception("monitor cannot compute cpu loads")

        # run all the monitoring coroutines
        asyncio.get_event_loop().run_until_complete(asyncio.gather(*futures))
        # write results
        for coursename, figures in figures_by_course.items():
            student_homes = CourseDir(coursename).student_homes()
            Stats(coursename).record_monitor_counts(
                figures.running_containers,
                figures.frozen_containers,
                figures.running_kernels,
                student_homes,
                load1,
                load5,
                load15,
                ds['docker']['percent'],
                ds['docker']['free'],
                ds['nbhosting']['percent'],
                ds['nbhosting']['free'],
                ds['system']['percent'],
                ds['system']['free'],
            )
Example #8
0
 async def co_run(self, idle, unused):
     """
     both timeouts in seconds
     """
     now = time.time()
     actual_hash = self.container.image.id
     # stopped containers need to be handled a bit differently
     if self.container.status != 'running':
         if actual_hash != self.image_hash:
             logger.info(f"Removing (stopped & outdated) {self} "
                         f"that has outdated hash {actual_hash[:15]} "
                         f"vs expected {self.image_hash[:15]}")
             self.container.remove(v=True)
         else:
             exited_time = self.exited_time()
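             # split the time since exit into whole days and remaining hours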
             unused_days = int((now - exited_time) // (24 * 3600))
             unused_hours = int((now - exited_time) // 3600 % 24)
             if (now - exited_time) > unused:
                 logger.info(f"Removing (stopped & unused) {self} "
                             f"that has been unused for {unused_days} days "
                             f"{unused_hours} hours")
                 self.container.remove(v=True)
             else:
                 logger.debug(
                     f"Ignoring stopped {self} that "
                     f"exited {unused_days} days {unused_hours} hours ago")
                 self.figures.count_container(False)
         return
     # count number of kernels and last activity
     await self.count_running_kernels()
     # last_activity may be 0 if no kernel is running inside that container
     # or None if we could not determine it properly
     if self.last_activity is None:
         logger.error(
             f"Skipping running {self} with no known last_activity")
         return
     # check there has been activity in the last <idle> seconds
     idle_minutes = int((now - self.last_activity) // 60)
     if (now - self.last_activity) < idle:
         logger.debug(
             f"Sparing running {self} that had activity {idle_minutes} mn ago"
         )
         self.figures.count_container(True, self.nb_kernels)
     else:
         if self.last_activity:
             logger.info(f"Killing (running & idle) {self} "
                         f"that has been idle for {idle_minutes} mn")
         else:
             logger.info(f"Killing (running and empty){self} "
                         f"that has no kernel attached")
         # kill it
         self.container.kill()
         # keep track of that kill in events.raw
         Stats(self.course).record_kill_jupyter(self.student)
         # if that container does not run the expected image hash,
         # the course image was upgraded in the meantime;
         # in that case we also remove the container so it gets
         # re-created with the right image next time
         if actual_hash != self.image_hash:
             logger.info(f"Removing (just killed & outdated) {self} "
                         f"that has outdated hash {actual_hash[:15]} "
                         f"vs expected {self.image_hash[:15]}")
             self.container.remove(v=True)
         else:
             # this counts for one dead container
             self.figures.count_container(False)
Example #9
0
def send_daily_metrics(request, course):
    stats = Stats(course)
    encoded = json.dumps(stats.daily_metrics())
    return HttpResponse(encoded, content_type="application/json")
Example #10
0
def send_material_usage(request, course):
    stats = Stats(course)
    encoded = json.dumps(stats.material_usage())
    return HttpResponse(encoded, content_type="application/json")
Example #11
0
def send_monitor_counts(request, course):
    stats = Stats(course)
    encoded = json.dumps(stats.monitor_counts())
    return HttpResponse(encoded, content_type="application/json")
Example #12
0
def edx_request(request, course, student, notebook):
    """
    the main edxfront entry point; it
    * creates a student if needed
    * copies the notebook if needed
    * makes sure the student container is ready to answer http requests
    and then returns a http redirect to /port/<notebook_path>
    """

    if not authorized(request):
        return HttpResponseForbidden()

    # the ipynb extension is removed from the notebook name in urls.py
    notebook_withext = notebook + ".ipynb"
    # have we received a request to force the copy (for reset_from_origin)
    forcecopy = request.GET.get('forcecopy', False)

    subcommand = 'docker-view-student-course-notebook'

    # build command
    command = ['nbh', '-d', sitesettings.nbhroot]
    if DEBUG:
        command.append('-x')
    command.append(subcommand)
    # propagate the forcecopy flag for reset_from_origin
    if forcecopy:
        command.append('-f')

    # add arguments to the subcommand
    command += [student, course, notebook_withext]
    logger.info("In {}\n-> Running command {}".format(Path.cwd(),
                                                      " ".join(command)))
    completed_process = subprocess.run(command,
                                       universal_newlines=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
    log_completed_process(completed_process, subcommand)

    if completed_process.returncode != 0:
        message = "command {} returned {}\nstderr:{}"\
                  .format(" ".join(command),
                          completed_process.returncode,
                          completed_process.stderr)
        return error_page(request, course, student, notebook, message)

    try:
        action, docker_name, actual_port, jupyter_token = \
            completed_process.stdout.split()

        if action.startswith("failed"):
            message = ("failed to spawn notebook container\n"
                       "command {}\nreturned with retcod={} action={}\n"
                       "stdout:{}\n"
                       "stderr:{}").format(" ".join(command),
                                           completed_process.returncode,
                                           action, completed_process.stdout,
                                           completed_process.stderr)
            return error_page(request, course, student, notebook, message)

        # remember that in events file for statistics
        Stats(course).record_open_notebook(student, notebook, action,
                                           actual_port)
        # redirect with same proto (http or https) as incoming
        scheme = request.scheme
        # get the host part of the incoming URL
        host = request.get_host()
        # remove initial port if present in URL
        if ':' in host:
            host, _ = host.split(':', 1)
        ########## forge a URL that nginx will intercept
        # port depends on scheme - we do not specify it
        # passing along course and student is for 'reset_from_origin'
        url = "{scheme}://{host}/{port}/notebooks/{path}?token={token}&course={course}&student={student}"\
              .format(scheme=scheme, host=host, port=actual_port,
                      path=notebook_withext, token=jupyter_token,
                      course=course, student=student)
        logger.info("edxfront: redirecting to {}".format(url))
        #        return HttpResponse('<a href="{}">click to be redirected</h1>'.format(url))
        return HttpResponseRedirect(url)

    except Exception as e:
        message = "exception when parsing output of nbh {}\n{}\n{}"\
                  .format(subcommand, completed_process.stdout, e)
        return error_page(request, course, student, notebook, message)
Example #13
0
def _open_notebook(request, coursename, student, notebook,
                   *, forcecopy, init_student_git): # pylint: disable=r0914
    """
    implement both edx_request and classroom_request
    that behave almost exactly the same
    """
    ok, explanation = authorized(request)

    if not ok:
        return HttpResponseForbidden(
            f"Access denied: {explanation}")

    coursedir = CourseDir.objects.get(coursename=coursename)
    if not coursedir.is_valid():
        return error_page(
            request, coursename, student, notebook,
            f"no such course `{coursename}'", header=True,
        )

    # the ipynb extension is removed from the notebook name in urls.py
    exists, notebook_with_ext, _, is_genuine_notebook = \
        locate_notebook(coursedir.git_dir, notebook)

    # second attempt from the student's space
    # in case the student has created it locally...
    if not exists:
        exists, notebook_with_ext, _, is_genuine_notebook = \
            locate_notebook(coursedir.student_dir(student), notebook)

    if not exists:
        msg = f"notebook `{notebook}' not known in this course or student"
        return error_page(request, coursename, student, notebook,
                          msg, header="notebook not found")

    subcommand = 'container-view-student-course-notebook'

    # build command
    command = ['nbh', '-d', sitesettings.nbhroot]
    if DEBUG:
        command.append('-x')
    command.append(subcommand)
    # propagate the forcecopy flag for reset_from_origin
    if forcecopy:
        command.append('-f')
    # propagate that a git initialization was requested
    # forcecopy has no effect in this case
    if init_student_git:
        command.append('-g')
        # a student repo gets cloned from local course git
        # for lower delays when updating, and removing issues
        # like accessing private repos from the students space
        ref_giturl = str(coursedir.git_dir)
    else:
        ref_giturl = coursedir.giturl

    # add arguments to the subcommand
    command += [student, coursename, notebook_with_ext,
                coursedir.image, ref_giturl]
    command_str = " ".join(command)
    logger.info(f'edxfront is running: {command_str} DEBUG={DEBUG}')
    completed = subprocess.run(
        command, universal_newlines=True,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    log_completed_process(completed, subcommand)

    try:
        action, _container_name, actual_port, jupyter_token = completed.stdout.split()

        if completed.returncode != 0 or action.startswith("failed"):
            message = failed_command_message(
                command_str, completed, prefix="failed to spawn notebook container")
            header = failed_command_header(action)
            return error_page(
                request, coursename, student, notebook, message, header)

        # remember that in events file for statistics
        Stats(coursename).record_open_notebook(student, notebook, action, actual_port)
        # redirect with same proto (http or https) as incoming
        scheme = request.scheme
        # get the host part of the incoming URL
        host = request.get_host()
        # remove initial port if present in URL
        if ':' in host:
            host, _ = host.split(':', 1)
        ########## forge a URL that nginx will intercept
        # passing along course and student is for 'reset_from_origin'
        if is_genuine_notebook:
            url = (f"{scheme}://{host}/{actual_port}/notebooks/"
                   f"{notebook_with_ext}?token={jupyter_token}&"
                   f"course={coursename}&student={student}")
        else:
            url = (f"{scheme}://{host}/{actual_port}/lab/tree/{notebook_with_ext}")
        logger.info(f"edxfront: redirecting to {url}")
        return HttpResponseRedirect(url)

    except Exception as exc:
        prefix = (f"exception when parsing output of nbh {subcommand}\n"
                   f"{type(exc)}: {exc}")
        message = failed_command_message(command_str, completed, prefix=prefix)
        return error_page(
            request, coursename, student, notebook, message)