def post(self, *args, **kwargs):
    """Create a new grading run for an assignment and kick off grading.

    Returns the new run's ID on success; aborts when the assignment has no
    configuration, the run body is invalid, or the run fails to start.
    """
    assignment_id = self.get_assignment_id(**kwargs)

    # a run can only be created for an assignment that was configured first
    config_dao = daos.AssignmentConfigDao(self.settings)
    assignment_config = config_dao.find_by_id(assignment_id)
    if not assignment_config:
        self.abort({"message": "assignment configuration not found"})
        return

    if not self._assert_run_valid(assignment_config):
        # abort in valid-run method for conciseness
        return

    # build the run document from the request body plus server-side fields
    run_attrs = dict(self.body)
    run_attrs.update(
        assignment_id=assignment_id,
        started_at=get_time(),
        state=models.GradingRunState.READY,
        student_jobs_left=len(self.body.get("students_env")),
    )
    grading_run = models.GradingRun(**run_attrs)

    run_dao = daos.GradingRunDao(self.settings)
    grading_run.id = str(run_dao.insert(grading_run).inserted_id)

    if not continue_grading_run(self.settings, grading_run):
        self.abort({"message": "failed to start grading run"}, status=500)
        return

    # trigger schedule event
    tornado.ioloop.IOLoop.current().add_callback(worker_schedule_job,
                                                 self.settings)
    return {"grading_run_id": grading_run.id}
def _handle_lost_worker_node(settings, worker, reason="timeout"):
    """Mark *worker* as dead and fail whatever job it was executing.

    Persists the worker's dead state, then (if it was running a job) marks
    that job failed and schedules the job-update callback.
    """
    # NOTE: despite the historical name elsewhere, this is a grading *job* ID
    lost_job_id = worker.running_job_id

    worker.is_alive = False
    worker.running_job_id = None
    WorkerNodeDao(settings).update(worker)

    if not lost_job_id:
        # idle worker: nothing else to clean up
        logger.critical(
            "worker '{}' went offline unexpectedly on '{}' due to {}".format(
                worker.id, worker.hostname, reason))
        return

    logger.critical("worker '{}' went offline unexpectedly on '{}' while"
                    " executing '{}' due to {}".format(
                        worker.id, worker.hostname, lost_job_id, reason))

    jobs_dao = GradingJobDao(settings)
    lost_job = jobs_dao.find_by_id(lost_job_id)
    if lost_job is None:
        logger.critical(("worker was reportedly executing job '{}' "
                         "but this job does not exist").format(lost_job_id))
        return

    # fail the orphaned job so its run can make progress
    lost_job.finished_at = get_time()
    lost_job.success = False
    lost_job.results = [{"result": "worker died while executing job"}]
    jobs_dao.update(lost_job)

    tornado.ioloop.IOLoop.current().add_callback(
        job_update_callback, settings, lost_job_id, lost_job.run_id)
def post(self, *args, **kwargs):
    """Register an HTTP worker node and return its heartbeat interval.

    Inserts a fresh node, revives a previously-dead one, or rejects a
    duplicate live registration with 400.
    """
    worker_id = kwargs.get("worker_id")
    hostname = self.body.get("hostname")

    dao = daos.WorkerNodeDao(self.settings)
    candidate = models.WorkerNode(id_=worker_id, hostname=hostname,
                                  last_seen=get_time(), is_alive=True)

    existing = dao.find_by_id(worker_id)
    if existing is None:
        # first time this worker ID is seen
        logger.info("new worker {} joined on {}".format(worker_id, hostname))
        dao.insert(candidate)
    elif not existing.is_alive:
        # a previously-dead worker rejoining under the same ID
        existing.is_alive = True
        logger.info("worker {} alive again on {}".format(worker_id, hostname))
        dao.update(existing)
    else:
        # the ID is already in use by a live worker
        msg = "worker id '{}' already exists".format(worker_id)
        logger.info(msg)
        self.abort({"message": msg}, status=400)
        return

    return {"heartbeat": self.get_flags()["heartbeat_interval"]}
def handler_register(self, hostname):
    """Register this websocket connection's worker in the database.

    Creates the worker node if unseen, revives it if previously dead, and
    rejects a duplicate live registration by closing the socket. On success,
    records the connection and triggers a scheduling pass.
    """
    if self.worker_id is None:
        return

    worker_node_dao = daos.WorkerNodeDao(self.settings)
    dup = worker_node_dao.find_by_id(self.worker_id)

    if dup is None:
        # brand-new worker: persist it as a websocket worker
        self.worker_node = models.WorkerNode(
            id_=self.worker_id,
            hostname=hostname,
            last_seen=get_time(),
            is_alive=True,
            use_ws=True,
        )
        logger.info("new worker '{}' joined on '{}'".format(
            self.worker_id, hostname))
        worker_node_dao.insert(self.worker_node)
    elif not dup.is_alive:
        # previously-dead worker reconnecting, possibly switching from HTTP
        self.worker_node = dup
        self.worker_node.hostname = hostname
        self.worker_node.last_seen = get_time()
        self.worker_node.is_alive = True
        # BUG FIX: mark the *persisted model* as a websocket worker. The old
        # code set ``self.use_ws`` on the handler instead, so a worker that
        # had previously registered over HTTP kept ``use_ws=False`` in the DB
        # and was never pushed jobs by worker_schedule_job.
        self.worker_node.use_ws = True
        logger.info("worker '{}' alive again on '{}'".format(
            self.worker_id, hostname))
        worker_node_dao.update(self.worker_node)
    else:
        # the ID belongs to a live worker: refuse and drop the connection
        msg = "worker id '{}' already exists".format(self.worker_id)
        logger.info(msg)
        self.send({"success": False})
        self.close(reason=msg, code=1002)
        return

    self.registered = True
    self.get_ws_conn_map()[self.worker_id] = self
    self.send({"success": True})
    # trigger schedule event
    tornado.ioloop.IOLoop.current().add_callback(worker_schedule_job,
                                                 self.settings)
def fail_grading_run(settings, run):
    """Mark *run* as failed and persist it.

    Logs (and does nothing else) when *run* is None.
    """
    if run is None:
        # BUG FIX: the old guard formatted ``run.id`` into this message,
        # which raised AttributeError on None — the exact case it guards.
        logger.critical("cannot fail non-existent grading run")
        return

    run.finished_at = get_time()
    run.state = GradingRunState.FAILED
    run.success = False

    run_dao = daos.GradingRunDao(settings)
    run_dao.update(run)
def post(self, *args, **kwargs):
    """ Allows workers to update grading job status on completion """
    worker_id = kwargs.get("worker_id")
    job_id = self.body.get("grading_job_id")

    job_dao = daos.GradingJobDao(self.settings)
    job = job_dao.find_by_id(job_id)
    if not job:
        self.abort({"message": "job with the given ID not found"})
        return

    # only a STARTED job may receive a completion update
    current_state = job.get_state()
    if current_state != models.GradingJobState.STARTED:
        logger.critical(
            "job with id '{}' updated when in state '{}'".format(
                job_id, current_state.value))
        self.abort(
            {"message": "cannot update job that is not in STARTED state"})
        return

    node_dao = daos.WorkerNodeDao(self.settings)
    node = node_dao.find_by_id(worker_id)
    if not node:
        logger.critical(
            "unknown node with ID '{}' successfully updated job".format(
                worker_id))
        self.abort({"message": ""}, status=404)
        return

    # clear the worker node's job
    node.running_job_id = None
    node.is_alive = True
    node_dao.update(node)

    # finish the job
    job.finished_at = get_time()
    job.results = self.body.get("results")
    job.success = self.body.get("success")
    job_dao.update(job)

    # store the logs
    log_dao = daos.GradingJobLogDao(self.settings)
    log_entry = models.GradingJobLog(job_id=job_id, **self.body.get("logs"))
    log_dao.insert(log_entry)

    # thread safe callback
    tornado.ioloop.IOLoop.current().add_callback(
        job_update_callback, self.settings, job_id, job.run_id)
def post(self, *args, **kwargs):
    """Record a heartbeat from an HTTP worker, reviving it if needed."""
    worker_id = kwargs.get("worker_id")
    dao = daos.WorkerNodeDao(self.settings)

    node = dao.find_by_id(worker_id)
    if not node:
        logger.critical(
            "unknown node with ID '{}' successfully sent heartbeat".format(
                worker_id))
        self.abort({"message": ""}, status=404)
        return

    # refresh liveness bookkeeping
    node.last_seen = get_time()
    node.is_alive = True
    dao.update(node)
def _prepare_next_job(settings, grading_run, global_job_environ,
                      runtime_job_environ, job_stages, job_type):
    """ Prepares a job to be submitted to queue """
    dao = daos.GradingJobDao(settings)

    # insert first so the DAO assigns the ID we expose to the job environment
    job = models.GradingJob(job_type=job_type, run_id=grading_run.id,
                            queued_at=get_time())
    job.id = str(dao.insert(job).inserted_id)

    # NOTE: intentionally mutates the caller's runtime environment mapping
    runtime_job_environ["GRADING_JOB_ID"] = job.id

    job.set_stages(job_stages, global_job_environ, runtime_job_environ)
    dao.update(job)
    return job.id
def worker_schedule_job(settings):
    """Push queued grading jobs to idle websocket-connected workers.

    Pulls at most one job per idle ws worker (in random order) and sends it
    over the worker's websocket. Returns when the queue is empty or a queued
    job ID has no backing job document.
    """
    conn_map = settings["WS_CONN_MAP"]
    job_queue = settings["QUEUE"]
    stream_queue = settings["STREAM_QUEUE"]
    grading_job_dao = GradingJobDao(settings)
    worker_node_dao = WorkerNodeDao(settings)

    idle_workers = worker_node_dao.find_by_idleness()
    # shuffle so assignments are spread across workers instead of always
    # favouring the same ones
    random.shuffle(idle_workers)

    for idle_worker in idle_workers:
        # only ws workers with a live connection can be pushed to from here;
        # HTTP workers poll for jobs instead
        if idle_worker.use_ws and idle_worker.id in conn_map:
            conn = conn_map[idle_worker.id]
            try:
                grading_job_id = job_queue.pull()
                job_queue.update_all_job_positions(stream_queue)
                grading_job = grading_job_dao.find_by_id(grading_job_id)
                if not grading_job:
                    # NOTE(review): the pulled ID is dropped here and the
                    # remaining idle workers are skipped — the queued entry
                    # is effectively lost; confirm this is intended
                    logger.critical(
                        "found job ID '{}' in queue, but job does not exist".
                        format(grading_job_id))
                    return
                # mark the job as started on this worker
                grading_job.started_at = get_time()
                grading_job.worker_id = idle_worker.id
                grading_job_dao.update(grading_job)
                # record the assignment on the worker side
                idle_worker.running_job_id = grading_job_id
                idle_worker.jobs_processed += 1
                worker_node_dao.update(idle_worker)
                conn.send({
                    "grading_job_id": grading_job_id,
                    "stages": grading_job.stages
                })
            except Empty:
                # no more jobs available
                return
            except Exception as e:
                # best-effort: log and keep trying the remaining idle workers
                logger.critical("failed to assign job to {}: {}".format(
                    idle_worker.id, repr(e)))
def worker_heartbeat_callback(settings):
    """ Checks if any workers went offline (after 2 * heartbeat_interval seconds) """
    now = get_time()
    cutoff = 2 * settings["FLAGS"]["heartbeat_interval"]
    conn_map = settings["WS_CONN_MAP"]
    dao = WorkerNodeDao(settings)

    for node in dao.find_by_liveness(alive=True):
        if (now - node.last_seen).total_seconds() < cutoff:
            continue
        # missed two heartbeat windows: drop any live socket, then mark lost
        if node.use_ws and node.id in conn_map:
            conn_map[node.id].close()
        _handle_lost_worker_node(settings, node)
def get(self, *args, **kwargs):
    """ Allows workers to request their next grading job """
    worker_id = kwargs.get("worker_id")
    node_dao = daos.WorkerNodeDao(self.settings)
    node = node_dao.find_by_id(worker_id)
    if not node:
        logger.critical(
            "unknown node with ID '{}' successfully requested job".format(
                worker_id))
        self.abort({"message": ""}, status=404)
        return

    # pull() is the only call here that raises Empty
    try:
        job_id = self.get_queue().pull()
    except Empty:
        self.abort({"message": "no jobs available"}, status=498)
        return

    # reflect the dequeue in the streaming view of the queue
    self.get_stream_queue().update_job_state(
        job_id, models.GradingJobState.STARTED.name)
    self.get_queue().update_all_job_positions(self.get_stream_queue())

    job_dao = daos.GradingJobDao(self.settings)
    job = job_dao.find_by_id(job_id)
    if not job:
        logger.critical(
            "found job ID '{}' in queue, but job does not exist".
            format(job_id))
        self.abort(
            {"message": "a failure occurred while getting next job"},
            status=500)
        return

    # mark the job as handed to this worker
    job.started_at = get_time()
    job.worker_id = worker_id
    job_dao.update(job)

    # record the assignment on the worker node
    node.running_job_id = job_id
    node.jobs_processed += 1
    node.is_alive = True
    node_dao.update(node)

    return {
        "grading_job_id": job_id,
        "stages": job.stages
    }
def on_ping(self, data):
    # ping messages have the same function as heartbeat requests
    # for normal http workers
    if self.worker_id is None:
        logger.critical("worker is not initialized")
        return

    dao = daos.WorkerNodeDao(self.settings)
    node = dao.find_by_id(self.worker_id)
    if not node:
        logger.critical(
            "unknown ws node with ID '{}' successfully sent heartbeat".
            format(self.worker_id))
        return

    # refresh the liveness timestamp
    node.last_seen = get_time()
    dao.update(node)
def _finish_grading_run(settings, grading_run):
    """Persist *grading_run* as successfully finished."""
    grading_run.state = GradingRunState.FINISHED
    grading_run.finished_at = get_time()
    grading_run.success = True
    daos.GradingRunDao(settings).update(grading_run)
def handler_job_result(self, grading_job_id, success, results, logs):
    """Handle a job result submitted by a websocket worker.

    Validates the submission (registered connection, known STARTED job,
    known worker), frees the worker, finishes the job, stores the logs,
    then schedules the job-update callback and a scheduling pass.
    """
    # results are only accepted after a successful register handshake
    if not self.registered:
        logger.info("worker '{}' submitted before registering".format(
            self.worker_id))
        self.close(reason="submitting before registering", code=1002)
        return

    grading_job_dao = daos.GradingJobDao(self.settings)
    job = grading_job_dao.find_by_id(grading_job_id)
    if not job:
        self.close(reason="job with the given ID not found", code=1002)
        return

    # only a STARTED job may receive a completion update
    job_state = job.get_state()
    if job_state != models.GradingJobState.STARTED:
        logger.critical(
            "job with id '{}' updated when in state '{}'".format(
                grading_job_id, job_state.value))
        self.close(reason="cannot update job that is not in STARTED state",
                   code=1002)
        return

    worker_node_dao = daos.WorkerNodeDao(self.settings)
    worker_node = worker_node_dao.find_by_id(self.worker_id)
    if not worker_node:
        msg = "unknown worker '{}' successfully updated job".format(
            self.worker_id)
        logger.critical(msg)
        self.close(reason=msg, code=1002)
        return

    logger.info("worker '{}' submitted job result for job '{}'".format(
        self.worker_id, grading_job_id))

    # clear the worker node's job
    worker_node.running_job_id = None
    worker_node_dao.update(worker_node)

    # finish the job
    job.finished_at = get_time()
    job.results = results
    job.success = success
    grading_job_dao.update(job)

    # store the logs
    job_log_dao = daos.GradingJobLogDao(self.settings)
    job_log = models.GradingJobLog(job_id=grading_job_id, **logs)
    job_log_dao.insert(job_log)

    # thread safe callback
    tornado.ioloop.IOLoop.current().add_callback(job_update_callback,
                                                 self.settings,
                                                 grading_job_id, job.run_id)
    # trigger schedule event
    tornado.ioloop.IOLoop.current().add_callback(worker_schedule_job,
                                                 self.settings)