async def _handle_new_job(self, message: BackendNewJob):
    """
    Receive a new job request from the backend, validate it, and hand it to the subclass.

    Validation performed here:
      - the requested environment must be known to this agent;
      - the task files must be present in this agent's task filesystem.

    On any failure a "crash" job result is sent back so the backend/client is not
    left waiting. The actual grading is delegated to ``self.new_job`` (subclass hook).

    :param message: the BackendNewJob message describing the job to run
    """
    self._logger.info("Received request for jobid %s", message.job_id)

    # For send_job_result internal checks
    self.__running_job[message.job_id] = False  # no ssh info sent

    # Tell the backend we started running the job
    await ZMQUtils.send(self.__backend_socket, AgentJobStarted(message.job_id))

    try:
        if message.environment not in self.environments:
            self._logger.warning("Task %s/%s ask for an unknown environment %s (not in aliases)",
                                 message.course_id, message.task_id, message.environment)
            raise CannotCreateJobException('This environment is not available in this agent. Please contact your course administrator.')

        task_fs = self._tasks_filesystem.from_subfolder(message.course_id).from_subfolder(message.task_id)
        if not task_fs.exists():
            self._logger.warning("Task %s/%s unavailable on this agent", message.course_id, message.task_id)
            raise CannotCreateJobException('Task unavailable on agent. Please retry later, the agents should synchronize soon. If the error '
                                           'persists, please contact your course administrator.')

        # Let the subclass run the job
        await self.new_job(message)
    except CannotCreateJobException as e:
        # Expected failure with a user-presentable message: forward it as-is.
        await self.send_job_result(message.job_id, "crash", e.message)
    except TooManyCallsException:
        self._logger.exception("TooManyCallsException in new_job")
        await self.send_job_result(message.job_id, "crash", "An unknown error occured in the agent. Please contact your course "
                                                            "administrator.")
    except JobNotRunningException:
        # The job was already terminated/cleaned up elsewhere; nothing to report back.
        self._logger.exception("JobNotRunningException in new_job")
    except Exception:
        # NOTE: deliberately `Exception`, not a bare `except:` — a bare except would
        # also swallow asyncio.CancelledError (a BaseException since Python 3.8) and
        # send a bogus "crash" result when the agent task is merely being cancelled.
        self._logger.exception("Unknown exception in new_job")
        await self.send_job_result(message.job_id, "crash", "An unknown error occured in the agent. Please contact your course "
                                                            "administrator.")
def _restore_ssh_port(self, ssh_port):
    """Return a reserved SSH debug port to the available pool (no-op if ``ssh_port`` is None)."""
    if ssh_port is not None:
        self.ssh_ports.add(ssh_port)

async def handle_new_job(self, message: BackendNewJob):
    """
    Handle a new job: validate the request, set up the working directories,
    create and start the grading container, then hand the running container
    over to ``handle_running_container`` and the cgroup killer/watcher.

    Every early-exit failure path sends a "crash" job result to the backend
    and releases any resources already acquired (temp directory, SSH port).

    :param message: the BackendNewJob message describing the job to run
    """
    try:
        self._logger.info("Received request for jobid %s", message.job_id)

        course_id = message.course_id
        task_id = message.task_id

        debug = message.debug
        environment_name = message.environment
        enable_network = message.enable_network
        time_limit = message.time_limit
        # Fall back to 3x the soft limit when no hard limit was requested.
        hard_time_limit = message.hard_time_limit or time_limit * 3
        mem_limit = message.mem_limit

        task_fs = self.tasks_fs.from_subfolder(course_id).from_subfolder(task_id)
        if not task_fs.exists():
            self._logger.warning("Task %s/%s unavailable on this agent", course_id, task_id)
            await self.send_job_result(message.job_id, "crash",
                                       'Task unavailable on agent. Please retry later, the agents should synchronize soon. If the error '
                                       'persists, please contact your course administrator.')
            return

        # Check for realistic memory limit value: clamp tiny requests up to a
        # workable minimum, and refuse requests beyond this agent's per-slot budget.
        if mem_limit < 20:
            mem_limit = 20
        elif mem_limit > self._max_memory_per_slot:
            self._logger.warning("Task %s/%s ask for too much memory (%dMB)! Available: %dMB",
                                 course_id, task_id, mem_limit, self._max_memory_per_slot)
            await self.send_job_result(message.job_id, "crash",
                                       'Not enough memory on agent (available: %dMB). Please contact your course administrator.'
                                       % self._max_memory_per_slot)
            return

        if environment_name not in self._containers:
            self._logger.warning("Task %s/%s ask for an unknown environment %s (not in aliases)",
                                 course_id, task_id, environment_name)
            await self.send_job_result(message.job_id, "crash",
                                       'Unknown container. Please contact your course administrator.')
            return

        environment = self._containers[environment_name]["id"]

        # Handle ssh debugging
        ssh_port = None
        if debug == "ssh":
            # allow 30 minutes of real time.
            time_limit = 30 * 60
            hard_time_limit = 30 * 60

            # select a port from the pool reserved for SSH debug sessions
            if len(self.ssh_ports) == 0:
                self._logger.warning("User asked for an ssh debug but no ports are available")
                await self.send_job_result(message.job_id, "crash",
                                           'No ports are available for SSH debug right now. Please retry later.')
                return
            ssh_port = self.ssh_ports.pop()

        # Create directories for storing all the data for the job
        try:
            container_path = tempfile.mkdtemp(dir=self.tmp_dir)
        except Exception as e:
            self._logger.error("Cannot make container temp directory! %s", str(e), exc_info=True)
            await self.send_job_result(message.job_id, "crash", 'Cannot make container temp directory.')
            self._restore_ssh_port(ssh_port)
            return

        task_path = os.path.join(container_path, 'task')  # tmp_dir/id/task/
        sockets_path = os.path.join(container_path, 'sockets')  # tmp_dir/id/socket/
        student_path = os.path.join(task_path, 'student')  # tmp_dir/id/task/student/
        systemfiles_path = os.path.join(task_path, 'systemfiles')  # tmp_dir/id/task/systemfiles/

        # Create the needed directories; world-writable so the (unprivileged)
        # container user can write into them.
        os.mkdir(sockets_path)
        os.chmod(container_path, 0o777)
        os.chmod(sockets_path, 0o777)

        # TODO: avoid copy
        await self._loop.run_in_executor(None, lambda: task_fs.copy_from(None, task_path))
        os.chmod(task_path, 0o777)

        if not os.path.exists(student_path):
            os.mkdir(student_path)
            os.chmod(student_path, 0o777)

        # Run the container
        try:
            container_id = await self._loop.run_in_executor(
                None, lambda: self._docker.create_container(environment, enable_network, mem_limit,
                                                            task_path, sockets_path, ssh_port))
        except Exception as e:
            self._logger.warning("Cannot create container! %s", str(e), exc_info=True)
            await self.send_job_result(message.job_id, "crash", 'Cannot create container.')
            await self._loop.run_in_executor(None, lambda: rmtree(container_path))
            self._restore_ssh_port(ssh_port)
            return

        # Store info so the rest of the agent can track this container.
        future_results = asyncio.Future()
        self._containers_running[container_id] = message, container_path, future_results
        self._container_for_job[message.job_id] = container_id
        self._student_containers_for_job[message.job_id] = set()
        if ssh_port is not None:
            self.running_ssh_debug[container_id] = ssh_port

        try:
            # Start the container
            await self._loop.run_in_executor(None, lambda: self._docker.start_container(container_id))
        except Exception as e:
            self._logger.warning("Cannot start container! %s", str(e), exc_info=True)
            await self.send_job_result(message.job_id, "crash", 'Cannot start container')
            await self._loop.run_in_executor(None, lambda: rmtree(container_path))
            self._restore_ssh_port(ssh_port)
            return

        # Talk to the container (runs concurrently; result arrives via future_results)
        self._loop.create_task(self.handle_running_container(message.job_id, container_id, message.inputdata, debug,
                                                             ssh_port, environment_name, mem_limit, time_limit,
                                                             hard_time_limit, sockets_path, student_path,
                                                             systemfiles_path, future_results))

        # Ask the "cgroup" thread to verify the timeout/memory limit
        await ZMQUtils.send(self._killer_watcher_push.get_push_socket(),
                            KWPRegisterContainer(container_id, mem_limit, time_limit, hard_time_limit))

        # Tell the backend/client the job has started
        await ZMQUtils.send(self._backend_socket, AgentJobStarted(message.job_id))
    except Exception:
        # NOTE: `Exception`, not a bare `except:` — a bare except would also swallow
        # asyncio.CancelledError (a BaseException since Python 3.8) and prevent clean
        # cancellation of the agent task.
        self._logger.exception("Exception in handle_new_job")