def get(self, *args, **kwargs):
    """
    List worker nodes for the requested scope.

    Reads ``scope`` from the keyword arguments. Only the ``"all"`` scope is
    handled: it returns a dict whose ``"worker_nodes"`` entry holds one
    summary dict (hostname, jobs processed, busy flag, alive flag) per node
    returned by ``WorkerNodeDao.find_all()``. Any other scope is rejected
    through ``self.abort`` with status 404 and nothing is returned.
    """
    scope = kwargs.get("scope")
    node_dao = daos.WorkerNodeDao(self.settings)

    # guard clause: anything other than "all" is not supported
    if scope != "all":
        self.abort(
            {"message": "scope {} has not been implemented yet".format(scope)},
            404,
        )
        return

    summaries = [
        {
            "hostname": node.hostname,
            "jobs_processed": node.jobs_processed,
            # a node is busy exactly when it has a running job assigned
            "busy": node.running_job_id is not None,
            "alive": node.is_alive,
        }
        for node in node_dao.find_all()
    ]
    return {"worker_nodes": summaries}
def post(self, *args, **kwargs):
    """
    Register a worker node over HTTP.

    Reads ``worker_id`` from the keyword arguments and ``hostname`` from the
    request body, then:

    * inserts a new worker node when the id is unknown;
    * revives the existing node when it is known but not alive, refreshing
      its hostname and last-seen time;
    * aborts with status 400 when a node with that id is already alive.

    Returns a dict whose ``"heartbeat"`` entry is the ``heartbeat_interval``
    flag, or ``None`` when the request was aborted.
    """
    worker_id = kwargs.get("worker_id")
    hostname = self.body.get("hostname")

    worker_node_dao = daos.WorkerNodeDao(self.settings)
    dup = worker_node_dao.find_by_id(worker_id)

    if dup is None:
        # only build the model when it is actually going to be inserted
        worker_node = models.WorkerNode(
            id_=worker_id,
            hostname=hostname,
            last_seen=get_time(),
            is_alive=True,
        )
        logger.info("new worker {} joined on {}".format(worker_id, hostname))
        worker_node_dao.insert(worker_node)
    elif not dup.is_alive:
        # Refresh hostname and last_seen as well as the alive flag, matching
        # the websocket registration path; otherwise the revived record keeps
        # the hostname and timestamp from before the worker went away.
        dup.hostname = hostname
        dup.last_seen = get_time()
        dup.is_alive = True
        logger.info("worker {} alive again on {}".format(worker_id, hostname))
        worker_node_dao.update(dup)
    else:
        msg = "worker id '{}' already exists".format(worker_id)
        logger.info(msg)
        self.abort({"message": msg}, status=400)
        return

    return {"heartbeat": self.get_flags()["heartbeat_interval"]}
def post(self, *args, **kwargs):
    """
    Allows workers to update grading job status on completion

    Reads ``worker_id`` from the keyword arguments and ``grading_job_id``,
    ``results``, ``success`` and ``logs`` from the request body. The request
    is aborted when the job cannot be found, when the job is not in the
    STARTED state, or when the worker id is unknown (status 404). Otherwise
    the worker's running job is cleared, the job is marked finished, its
    logs are stored, and ``job_update_callback`` is scheduled on the current
    IOLoop.
    """
    worker_id = kwargs.get("worker_id")
    job_id = self.body.get("grading_job_id")

    # --- validate the job ---
    job_dao = daos.GradingJobDao(self.settings)
    job = job_dao.find_by_id(job_id)
    if not job:
        self.abort({"message": "job with the given ID not found"})
        return

    state = job.get_state()
    if state != models.GradingJobState.STARTED:
        state_msg = "job with id '{}' updated when in state '{}'".format(
            job_id, state.value
        )
        logger.critical(state_msg)
        self.abort({"message": "cannot update job that is not in STARTED state"})
        return

    # --- validate the worker ---
    node_dao = daos.WorkerNodeDao(self.settings)
    node = node_dao.find_by_id(worker_id)
    if not node:
        unknown_msg = "unknown node with ID '{}' successfully updated job".format(
            worker_id
        )
        logger.critical(unknown_msg)
        self.abort({"message": ""}, status=404)
        return

    # the worker no longer has a running job
    node.running_job_id = None
    node.is_alive = True
    node_dao.update(node)

    # record the outcome on the job
    job.finished_at = get_time()
    job.results = self.body.get("results")
    job.success = self.body.get("success")
    job_dao.update(job)

    # persist the logs sent with the result
    log_dao = daos.GradingJobLogDao(self.settings)
    log_fields = self.body.get("logs")
    log_dao.insert(models.GradingJobLog(job_id=job_id, **log_fields))

    # thread safe callback
    tornado.ioloop.IOLoop.current().add_callback(
        job_update_callback, self.settings, job_id, job.run_id
    )
def post(self, *args, **kwargs):
    """
    Record a heartbeat from a worker.

    Looks up the worker named by the ``worker_id`` keyword argument. A known
    worker gets its ``last_seen`` time refreshed and is marked alive; an
    unknown worker id is logged and the request is aborted with status 404.
    """
    worker_id = kwargs.get("worker_id")

    node_dao = daos.WorkerNodeDao(self.settings)
    node = node_dao.find_by_id(worker_id)

    if node:
        node.last_seen = get_time()
        node.is_alive = True
        node_dao.update(node)
        return

    unknown_msg = "unknown node with ID '{}' successfully sent heartbeat".format(
        worker_id
    )
    logger.critical(unknown_msg)
    self.abort({"message": ""}, status=404)
def get(self, *args, **kwargs):
    """
    Allows workers to request their next grading job

    Looks up the worker named by the ``worker_id`` keyword argument, pulls
    the next job id from the queue, marks the job as started and assigns it
    to the worker.

    Returns a dict with ``"grading_job_id"`` and ``"stages"`` on success.
    Returns ``None`` after aborting with:

    * 404 when the worker id is unknown;
    * 498 when the queue has no job available;
    * 500 when the queued job id has no matching job record.
    """
    worker_id = kwargs.get("worker_id")

    worker_node_dao = daos.WorkerNodeDao(self.settings)
    worker_node = worker_node_dao.find_by_id(worker_id)
    if not worker_node:
        logger.critical(
            "unknown node with ID '{}' successfully requested job".format(
                worker_id))
        self.abort({"message": ""}, status=404)
        return

    # Only the pull itself is expected to signal an empty queue. Keeping the
    # try this narrow prevents an Empty raised by any later call from being
    # reported as "no jobs available" after a job has already been dequeued.
    try:
        grading_job_id = self.get_queue().pull()
    except Empty:
        self.abort({"message": "no jobs available"}, status=498)
        return

    self.get_stream_queue().update_job_state(
        grading_job_id, models.GradingJobState.STARTED.name)
    self.get_queue().update_all_job_positions(self.get_stream_queue())

    grading_job_dao = daos.GradingJobDao(self.settings)
    grading_job = grading_job_dao.find_by_id(grading_job_id)
    if not grading_job:
        logger.critical(
            "found job ID '{}' in queue, but job does not exist".format(
                grading_job_id))
        self.abort(
            {"message": "a failure occurred while getting next job"},
            status=500)
        return

    # mark the job as started by this worker
    grading_job.started_at = get_time()
    grading_job.worker_id = worker_id
    grading_job_dao.update(grading_job)

    # mark the worker as busy with this job
    worker_node.running_job_id = grading_job_id
    worker_node.jobs_processed += 1
    worker_node.is_alive = True
    worker_node_dao.update(worker_node)

    return {
        "grading_job_id": grading_job_id,
        "stages": grading_job.stages,
    }
def handler_register(self, hostname):
    """
    Register this websocket connection as a worker node.

    Does nothing when ``self.worker_id`` is ``None``. Otherwise:

    * inserts a new worker node (flagged ``use_ws=True``) when the id is
      unknown;
    * revives the existing node when it is known but not alive, refreshing
      its hostname, last-seen time, alive flag and websocket flag;
    * sends ``{"success": False}`` and closes the connection with code 1002
      when a node with that id is already alive.

    On success the handler is marked registered, stored in the websocket
    connection map under its worker id, ``{"success": True}`` is sent, and
    ``worker_schedule_job`` is scheduled on the current IOLoop.

    :param hostname: hostname reported by the worker.
    """
    if self.worker_id is None:
        return

    worker_node_dao = daos.WorkerNodeDao(self.settings)
    dup = worker_node_dao.find_by_id(self.worker_id)

    if dup is None:
        self.worker_node = models.WorkerNode(
            id_=self.worker_id,
            hostname=hostname,
            last_seen=get_time(),
            is_alive=True,
            use_ws=True,
        )
        logger.info("new worker '{}' joined on '{}'".format(
            self.worker_id, hostname))
        worker_node_dao.insert(self.worker_node)
    elif not dup.is_alive:
        self.worker_node = dup
        self.worker_node.hostname = hostname
        self.worker_node.last_seen = get_time()
        self.worker_node.is_alive = True
        # Set the websocket flag on the node record, mirroring the
        # ``use_ws=True`` passed when a new node is created above; the
        # original only set it on the handler, so the stored node kept
        # whatever value it had before.
        self.worker_node.use_ws = True
        # NOTE(review): handler-level attribute kept for backward
        # compatibility; it is probably redundant - confirm and remove.
        self.use_ws = True
        logger.info("worker '{}' alive again on '{}'".format(
            self.worker_id, hostname))
        worker_node_dao.update(self.worker_node)
    else:
        msg = "worker id '{}' already exists".format(self.worker_id)
        logger.info(msg)
        self.send({"success": False})
        self.close(reason=msg, code=1002)
        return

    self.registered = True
    self.get_ws_conn_map()[self.worker_id] = self
    self.send({"success": True})

    # trigger schedule event
    tornado.ioloop.IOLoop.current().add_callback(worker_schedule_job,
                                                 self.settings)
def on_ping(self, data):
    """
    Treat a websocket ping as a worker heartbeat.

    Ping messages have the same function as heartbeat requests for normal
    http workers: the worker node's ``last_seen`` time is refreshed. Nothing
    is updated when the handler has no worker id or the worker id is
    unknown; both cases are logged.

    :param data: ping payload (unused).
    """
    worker_id = self.worker_id
    if worker_id is None:
        logger.critical("worker is not initialized")
        return

    node_dao = daos.WorkerNodeDao(self.settings)
    node = node_dao.find_by_id(worker_id)

    if node:
        node.last_seen = get_time()
        node_dao.update(node)
        return

    unknown_msg = "unknown ws node with ID '{}' successfully sent heartbeat".format(
        worker_id
    )
    logger.critical(unknown_msg)
def handler_job_result(self, grading_job_id, success, results, logs):
    """
    Accept a grading job result sent over the websocket.

    The connection is closed with code 1002 when the worker has not
    registered, the job cannot be found, the job is not in the STARTED
    state, or the worker id is unknown. Otherwise the worker's running job
    is cleared, the job is marked finished with the given outcome, its logs
    are stored, and both ``job_update_callback`` and ``worker_schedule_job``
    are scheduled on the current IOLoop.

    :param grading_job_id: id of the job being reported.
    :param success: stored on the job as ``success``.
    :param results: stored on the job as ``results``.
    :param logs: mapping expanded into keyword arguments of
        ``GradingJobLog``.
    """

    def reject(reason):
        # every validation failure closes the connection the same way
        self.close(reason=reason, code=1002)

    if not self.registered:
        logger.info(
            "worker '{}' submitted before registering".format(self.worker_id)
        )
        reject("submitting before registering")
        return

    # --- validate the job ---
    job_dao = daos.GradingJobDao(self.settings)
    job = job_dao.find_by_id(grading_job_id)
    if not job:
        reject("job with the given ID not found")
        return

    state = job.get_state()
    if state != models.GradingJobState.STARTED:
        state_msg = "job with id '{}' updated when in state '{}'".format(
            grading_job_id, state.value
        )
        logger.critical(state_msg)
        reject("cannot update job that is not in STARTED state")
        return

    # --- validate the worker ---
    node_dao = daos.WorkerNodeDao(self.settings)
    node = node_dao.find_by_id(self.worker_id)
    if not node:
        msg = "unknown worker '{}' successfully updated job".format(self.worker_id)
        logger.critical(msg)
        reject(msg)
        return

    logger.info(
        "worker '{}' submitted job result for job '{}'".format(
            self.worker_id, grading_job_id
        )
    )

    # the worker no longer has a running job
    node.running_job_id = None
    node_dao.update(node)

    # record the outcome on the job
    job.finished_at = get_time()
    job.results = results
    job.success = success
    job_dao.update(job)

    # persist the logs sent with the result
    log_dao = daos.GradingJobLogDao(self.settings)
    log_dao.insert(models.GradingJobLog(job_id=grading_job_id, **logs))

    # thread safe callback
    tornado.ioloop.IOLoop.current().add_callback(
        job_update_callback, self.settings, grading_job_id, job.run_id
    )

    # trigger schedule event
    tornado.ioloop.IOLoop.current().add_callback(
        worker_schedule_job, self.settings
    )