def _dispatch(
    self, request: tuple[str, ...]
) -> "tuple[BaseLearner, str | list[str]] | Exception | None":
    """Handle one request from a worker and build the reply to send back.

    Parameters
    ----------
    request : tuple
        ``("start", job_id, log_fname, job_name)`` to ask for a learner, or
        ``("stop", fname)`` to report that the learner stored under
        ``fname`` is finished.

    Returns
    -------
    reply : tuple, Exception, or None
        ``(learner, fname)`` for a successful "start" request, ``None`` for
        a successful "stop" request, or the exception instance describing
        what went wrong. Exceptions are returned rather than raised so that
        they can be sent back to the worker.
    """
    log.debug("got a request", request=request)
    try:
        # Unpack inside the ``try`` so a malformed (e.g. empty) request is
        # answered with an exception instead of raising in the server.
        request_type, *request_arg = request
        if request_type == "start":
            # workers send us their slurm ID for us to fill in
            job_id, log_fname, job_name = request_arg
            kwargs = dict(job_id=job_id, log_fname=log_fname, job_name=job_name)
            # give the worker a job and send back the fname to the worker
            fname = self._start_request(**kwargs)
            if fname is None:
                raise RuntimeError("No more learners to run in the database.")
            # Use a default so a missing match gives a descriptive error
            # rather than a bare ``StopIteration`` being sent to the worker.
            learner = next(
                (
                    candidate
                    for candidate, f in zip(self.learners, self.fnames)
                    if maybe_lst(f) == fname
                ),
                None,
            )
            if learner is None:
                raise RuntimeError(f"No learner is registered for fname {fname!r}.")
            log.debug("choose a fname", fname=fname, **kwargs)
            return learner, fname
        if request_type == "stop":
            fname = request_arg[0]  # workers send us the fname they were given
            log.debug("got a stop request", fname=fname)
            self._stop_request(fname)  # reset the job_id to None
            return None
        # An unknown request must not be answered with ``None``: the client
        # would read that as "no learners left".
        raise ValueError(f"Unknown request type: {request_type!r}")
    except Exception as e:
        return e
def _stop_requests(self, fnames: list[str | list[str]]) -> None:
    """Mark all database entries belonging to ``fnames`` as done in one pass.

    Same as `_stop_request` but optimized for processing many `fnames` at
    once: the database is opened a single time, all entries are scanned,
    and the matching ones are updated together.

    Parameters
    ----------
    fnames : list
        Filenames whose entries should be reset. Each one is passed through
        ``maybe_lst`` and compared by its string representation.
    """
    wanted = set()
    for fname in fnames:
        wanted.add(str(maybe_lst(fname)))

    cleared = {"job_id": None, "is_done": True, "job_name": None}
    with TinyDB(self.db_fname) as db:
        matching_ids = []
        for entry in db.all():
            if str(entry["fname"]) in wanted:
                matching_ids.append(entry.doc_id)
        db.update(cleared, doc_ids=matching_ids)
def _stop_request(self, fname: str | list[str]) -> None:
    """Mark the database entry for ``fname`` as done and clear its job info.

    Parameters
    ----------
    fname : str or list of str
        Filename identifying the entry; it is normalized with ``maybe_lst``
        before the lookup.

    Raises
    ------
    AssertionError
        If the database has no entry for ``fname``.
    """
    fname = maybe_lst(fname)  # if a BalancingLearner
    Entry = Query()
    with TinyDB(self.db_fname) as db:
        reset = dict(job_id=None, is_done=True, job_name=None)
        # Make sure the entry exists. This is an explicit check rather than
        # an ``assert`` statement so it is not stripped under ``python -O``;
        # the exception type is kept for backward compatibility.
        if db.get(Entry.fname == fname) is None:
            raise AssertionError(f"No database entry found for fname {fname!r}.")
        db.update(reset, Entry.fname == fname)
def get_learner(
    url: str, log_fname: str, job_id: str, job_name: str
) -> tuple[BaseLearner, str | list[str]]:
    """Get a learner from the database running at `url`.

    This learner's process will be logged in `log_fname` and run under
    `job_id`.

    Parameters
    ----------
    url : str
        The url of the database manager running via
        (`adaptive_scheduler.server_support.manage_database`).
    log_fname : str
        The filename of the log-file. Should be passed in the job-script.
    job_id : str
        The job_id of the process running the job. Should be passed in the
        job-script.
    job_name : str
        The name of the job. Should be passed in the job-script.

    Returns
    -------
    learner : `adaptive.BaseLearner`
        Learner that is chosen.
    fname : str or list of str
        The filename(s) of the learner that was chosen.

    Raises
    ------
    RuntimeError
        If the database manager replies with ``None``.
    Exception
        Any exception instance sent back by the database manager is
        re-raised here.
    """
    _add_log_file_handler(log_fname)
    log.info(
        "trying to get learner", job_id=job_id, log_fname=log_fname, job_name=job_name
    )
    with ctx.socket(zmq.REQ) as socket:
        socket.setsockopt(zmq.LINGER, 0)
        socket.setsockopt(zmq.SNDTIMEO, 300_000)  # timeout after 300s
        socket.connect(url)
        socket.send_serialized(("start", job_id, log_fname, job_name), _serialize)
        # The message used to say 60s, which did not match the 300s timeout.
        log.info("sent start signal, going to wait 300s for a reply.")
        socket.setsockopt(zmq.RCVTIMEO, 300_000)  # timeout after 300s
        reply = socket.recv_serialized(_deserialize)
        log.info("got reply", reply=str(reply))
        if reply is None:
            msg = "No learners to be run."
            exception = RuntimeError(msg)
            log_exception(log, msg, exception)
            raise exception
        if isinstance(reply, Exception):
            log_exception(log, "got an exception", exception=reply)
            raise reply
        learner, fname = reply
        log.info("got fname and learner")

    log.info("picked a learner")
    return learner, maybe_lst(fname)